diff --git a/libs/libvpx/AUTHORS b/libs/libvpx/AUTHORS index fcd5c534a8..04c2872432 100644 --- a/libs/libvpx/AUTHORS +++ b/libs/libvpx/AUTHORS @@ -3,11 +3,13 @@ Aaron Watry Abo Talib Mahfoodh -Adam Xu Adrian Grange Aℓex Converse Ahmad Sharif +Aleksey Vasenev +Alexander Potapenko Alexander Voronov +Alexandra Hájková Alexis Ballier Alok Ahuja Alpha Lam @@ -15,6 +17,7 @@ A.Mahfoodh Ami Fischman Andoni Morales Alastruey Andres Mejia +Andrew Lewis Andrew Russell Angie Chiang Aron Rosenberg @@ -22,11 +25,14 @@ Attila Nagy Brion Vibber changjun.yang Charles 'Buck' Krasic +Cheng Chen chm +Chris Cunningham Christian Duvivier Daniele Castagna Daniel Kang Deb Mukherjee +Deepa K G Dim Temp Dmitry Kovalev Dragan Mrdjan @@ -37,17 +43,21 @@ Fabio Pedretti Frank Galligan Fredrik Söderquist Fritz Koenig +Gabriel Marin Gaute Strokkenes Geza Lore Ghislain MARY Giuseppe Scrivano Gordana Cmiljanovic +Gregor Jasny Guillaume Martres Guillermo Ballester Valor Hangyu Kuang Hanno Böck +Han Shen Henrik Lundin Hui Su +Ivan Krasin Ivan Maltz Jacek Caban Jacky Chen @@ -61,6 +71,7 @@ Jean-Yves Avenard Jeff Faust Jeff Muizelaar Jeff Petkau +Jerome Jiang Jia Jia Jian Zhou Jim Bankoski @@ -75,7 +86,9 @@ Joshua Litt Julia Robson Justin Clift Justin Lebar +Kaustubh Raste KO Myung-Hun +Kyle Siefring Lawrence Velázquez Linfeng Zhang Lou Quillio @@ -91,8 +104,12 @@ Michael Kohler Mike Frysinger Mike Hommey Mikhal Shemer +Min Chen Minghai Shang +Min Ye +Moriyoshi Koizumi Morton Jonuschat +Nathan E. Egge Nico Weber Parag Salasakar Pascal Massimino @@ -101,16 +118,22 @@ Paul Wilkins Pavol Rusnak Paweł Hajdan Pengchong Jin +Peter Boström +Peter Collingbourne Peter de Rivaz Philip Jägenstedt Priit Laes Rafael Ávila de Espíndola Rafaël Carré +Rafael de Lucena Valle +Rahul Chaudhry Ralph Giles +Ranjit Kumar Tulabandu Rob Bradford Ronald S. Bultje Rui Ueyama Sami Pietilä +Sarah Parker Sasi Inguva Scott Graham Scott LaVarnway @@ -118,9 +141,11 @@ Sean McGovern Sergey Kolomenkin Sergey Ulanov Shimon Doodkin +Shiyou Yin Shunyao Li Stefan Holmer Suman Sunkara +Sylvestre Ledru Taekhyun Kim Takanori MATSUURA Tamar Levy @@ -130,7 +155,10 @@ Thijs Vermeir Tim Kopp Timothy B. Terriberry Tom Finegan +Tristan Matthews +Urvang Joshi Vignesh Venkatasubramanian +Vlad Tsyrklevich Yaowu Xu Yi Luo Yongzhe Wang diff --git a/libs/libvpx/CHANGELOG b/libs/libvpx/CHANGELOG index 795d395f96..2281394c8e 100644 --- a/libs/libvpx/CHANGELOG +++ b/libs/libvpx/CHANGELOG @@ -1,3 +1,44 @@ +2017-01-04 v1.7.0 "Mandarin Duck" + This release focused on high bit depth performance (10/12 bit) and vp9 + encoding improvements. + + - Upgrading: + This release is ABI incompatible due to new vp9 encoder features. + + Frame parallel decoding for vp9 has been removed. + + - Enhancements: + vp9 encoding supports additional threads with --row-mt. This can be greater + than the number of tiles. + + Two new vp9 encoder options have been added: + --corpus-complexity + --tune-content=film + + Additional tooling for respecting the vp9 "level" profiles has been added. + + - Bug fixes: + A variety of fuzzing issues. + vp8 threading fix for ARM. + Codec control VP9_SET_SKIP_LOOP_FILTER fixed. + Reject invalid multi resolution configurations. + +2017-01-09 v1.6.1 "Long Tailed Duck" + This release improves upon the VP9 encoder and speeds up the encoding and + decoding processes. + + - Upgrading: + This release is ABI compatible with 1.6.0. + + - Enhancements: + Faster VP9 encoding and decoding. + High bit depth builds now provide similar speed for 8 bit encode and decode + for x86 targets. 
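As a rough illustration of the v1.7.0 encoder options listed in the CHANGELOG entry above (a sketch only — flag spellings and accepted values should be checked against this release's vpxenc --help):

    # Hypothetical vpxenc invocation exercising the new v1.7.0 options:
    # row-based multithreading, film-grain content tuning, and corpus-wide
    # complexity for VBR. Filenames are placeholders.
    vpxenc --codec=vp9 --threads=8 --tile-columns=2 --row-mt=1 \
      --tune-content=film --corpus-complexity=50 \
      -o output.webm input.y4m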
Other platforms and higher bit depth improvements are in + progress. + + - Bug Fixes: + A variety of fuzzing issues. + 2016-07-20 v1.6.0 "Khaki Campbell Duck" This release improves upon the VP9 encoder and speeds up the encoding and decoding processes. diff --git a/libs/libvpx/README b/libs/libvpx/README index b9266ca267..73304dd62f 100644 --- a/libs/libvpx/README +++ b/libs/libvpx/README @@ -1,4 +1,4 @@ -README - 20 July 2016 +README - 24 January 2018 Welcome to the WebM VP8/VP9 Codec SDK! @@ -47,6 +47,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: --help output of the configure script. As of this writing, the list of available targets is: + arm64-android-gcc arm64-darwin-gcc arm64-linux-gcc armv7-android-gcc @@ -57,10 +58,13 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv7-win32-vs11 armv7-win32-vs12 armv7-win32-vs14 + armv7-win32-vs15 armv7s-darwin-gcc armv8-linux-gcc mips32-linux-gcc mips64-linux-gcc + ppc64-linux-gcc + ppc64le-linux-gcc sparc-solaris-gcc x86-android-gcc x86-darwin8-gcc @@ -73,6 +77,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86-darwin13-gcc x86-darwin14-gcc x86-darwin15-gcc + x86-darwin16-gcc x86-iphonesimulator-gcc x86-linux-gcc x86-linux-icc @@ -83,6 +88,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86-win32-vs11 x86-win32-vs12 x86-win32-vs14 + x86-win32-vs15 x86_64-android-gcc x86_64-darwin9-gcc x86_64-darwin10-gcc @@ -91,6 +97,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86_64-darwin13-gcc x86_64-darwin14-gcc x86_64-darwin15-gcc + x86_64-darwin16-gcc x86_64-iphonesimulator-gcc x86_64-linux-gcc x86_64-linux-icc @@ -100,6 +107,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86_64-win64-vs11 x86_64-win64-vs12 x86_64-win64-vs14 + x86_64-win64-vs15 generic-gnu The generic-gnu target, in conjunction with the CROSS environment variable, diff --git a/libs/libvpx/args.c b/libs/libvpx/args.c index bd1ede0388..a87b138b9d 100644 --- a/libs/libvpx/args.c +++ b/libs/libvpx/args.c @@ -13,6 +13,7 @@ #include #include "args.h" +#include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" #if defined(__GNUC__) && __GNUC__ @@ -118,13 +119,13 @@ void arg_show_usage(FILE *fp, const struct arg_def *const *defs) { } unsigned int arg_parse_uint(const struct arg *arg) { - long int rawval; + uint32_t rawval; char *endptr; - rawval = strtol(arg->val, &endptr, 10); + rawval = (uint32_t)strtoul(arg->val, &endptr, 10); if (arg->val[0] != '\0' && endptr[0] == '\0') { - if (rawval >= 0 && rawval <= UINT_MAX) return (unsigned int)rawval; + if (rawval <= UINT_MAX) return rawval; die("Option %s: Value %ld out of range for unsigned int\n", arg->name, rawval); @@ -135,10 +136,10 @@ unsigned int arg_parse_uint(const struct arg *arg) { } int arg_parse_int(const struct arg *arg) { - long int rawval; + int32_t rawval; char *endptr; - rawval = strtol(arg->val, &endptr, 10); + rawval = (int32_t)strtol(arg->val, &endptr, 10); if (arg->val[0] != '\0' && endptr[0] == '\0') { if (rawval >= INT_MIN && rawval <= INT_MAX) return (int)rawval; diff --git a/libs/libvpx/build/make/Android.mk b/libs/libvpx/build/make/Android.mk index 36120170e8..a88f90056e 100644 --- a/libs/libvpx/build/make/Android.mk +++ b/libs/libvpx/build/make/Android.mk @@ -64,6 +64,9 @@ CONFIG_DIR := $(LOCAL_PATH)/ LIBVPX_PATH := $(LOCAL_PATH)/libvpx ASM_CNV_PATH_LOCAL := $(TARGET_ARCH_ABI)/ads2gas ASM_CNV_PATH := $(LOCAL_PATH)/$(ASM_CNV_PATH_LOCAL) +ifneq ($(V),1) + qexec := @ +endif # Use the makefiles generated by upstream configure to determine which files to # build. Also set any architecture-specific flags. 
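The qexec assignment in the Android.mk hunk above follows the usual "silent rule" make idiom: commands are prefixed with $(qexec), which expands to "@" (suppressing command echo) unless the build is invoked with V=1. A sketch of the effect when driving this file with ndk-build (invocation details are illustrative, not the project's documented command line):

    # Quiet build: the ads2gas/mkdir commands run without being echoed.
    ndk-build APP_BUILD_SCRIPT=Android.mk
    # Verbose build: V=1 leaves qexec empty, so every command is printed.
    ndk-build APP_BUILD_SCRIPT=Android.mk V=1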
@@ -71,7 +74,7 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) include $(CONFIG_DIR)libs-armv7-android-gcc.mk LOCAL_ARM_MODE := arm else ifeq ($(TARGET_ARCH_ABI),arm64-v8a) - include $(CONFIG_DIR)libs-armv8-android-gcc.mk + include $(CONFIG_DIR)libs-arm64-android-gcc.mk LOCAL_ARM_MODE := arm else ifeq ($(TARGET_ARCH_ABI),x86) include $(CONFIG_DIR)libs-x86-android-gcc.mk @@ -101,10 +104,10 @@ LOCAL_CFLAGS := -O3 # like x86inc.asm and x86_abi_support.asm LOCAL_ASMFLAGS := -I$(LIBVPX_PATH) -.PRECIOUS: %.asm.s -$(ASM_CNV_PATH)/libvpx/%.asm.s: $(LIBVPX_PATH)/%.asm - @mkdir -p $(dir $@) - @$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@ +.PRECIOUS: %.asm.S +$(ASM_CNV_PATH)/libvpx/%.asm.S: $(LIBVPX_PATH)/%.asm + $(qexec)mkdir -p $(dir $@) + $(qexec)$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@ # For building *_rtcd.h, which have rules in libs.mk TGT_ISA:=$(word 1, $(subst -, ,$(TOOLCHAIN))) @@ -132,7 +135,7 @@ endif # Pull out assembly files, splitting NEON from the rest. This is # done to specify that the NEON assembly files use NEON assembler flags. -# x86 assembly matches %.asm, arm matches %.asm.s +# x86 assembly matches %.asm, arm matches %.asm.S # x86: @@ -140,31 +143,44 @@ CODEC_SRCS_ASM_X86 = $(filter %.asm, $(CODEC_SRCS_UNIQUE)) LOCAL_SRC_FILES += $(foreach file, $(CODEC_SRCS_ASM_X86), libvpx/$(file)) # arm: -CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.s, $(CODEC_SRCS_UNIQUE)) +CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.S, $(CODEC_SRCS_UNIQUE)) CODEC_SRCS_ASM_ARM = $(foreach v, \ $(CODEC_SRCS_ASM_ARM_ALL), \ $(if $(findstring neon,$(v)),,$(v))) -CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.s, \ - $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \ +CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.S, \ + $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \ $(CODEC_SRCS_ASM_ARM)) LOCAL_SRC_FILES += $(CODEC_SRCS_ASM_ADS2GAS) ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) + ASM_INCLUDES := vpx_dsp/arm/idct_neon.asm.S CODEC_SRCS_ASM_NEON = $(foreach v, \ $(CODEC_SRCS_ASM_ARM_ALL),\ $(if $(findstring neon,$(v)),$(v),)) - CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.s, \ - $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \ + CODEC_SRCS_ASM_NEON := $(filter-out $(addprefix %, $(ASM_INCLUDES)), \ + $(CODEC_SRCS_ASM_NEON)) + CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.S, \ + $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \ $(CODEC_SRCS_ASM_NEON)) - LOCAL_SRC_FILES += $(patsubst %.s, \ - %.s.neon, \ + LOCAL_SRC_FILES += $(patsubst %.S, \ + %.S.neon, \ $(CODEC_SRCS_ASM_NEON_ADS2GAS)) + + NEON_ASM_TARGETS = $(patsubst %.S, \ + $(ASM_CNV_PATH)/libvpx/%.S, \ + $(CODEC_SRCS_ASM_NEON)) +# add a dependency to the full path to the ads2gas output to ensure the +# includes are converted first. 
+ifneq ($(strip $(NEON_ASM_TARGETS)),) +$(NEON_ASM_TARGETS): $(addprefix $(ASM_CNV_PATH)/libvpx/, $(ASM_INCLUDES)) +endif endif LOCAL_CFLAGS += \ -DHAVE_CONFIG_H=vpx_config.h \ -I$(LIBVPX_PATH) \ - -I$(ASM_CNV_PATH) + -I$(ASM_CNV_PATH) \ + -I$(ASM_CNV_PATH)/libvpx LOCAL_MODULE := libvpx @@ -185,7 +201,8 @@ endif $$(rtcd_dep_template_SRCS): vpx_scale_rtcd.h $$(rtcd_dep_template_SRCS): vpx_dsp_rtcd.h -ifneq ($(findstring $(TARGET_ARCH_ABI),x86 x86_64),) +rtcd_dep_template_CONFIG_ASM_ABIS := x86 x86_64 armeabi-v7a +ifneq ($$(findstring $(TARGET_ARCH_ABI),$$(rtcd_dep_template_CONFIG_ASM_ABIS)),) $$(rtcd_dep_template_SRCS): vpx_config.asm endif endef @@ -195,16 +212,17 @@ $(eval $(call rtcd_dep_template)) .PHONY: clean clean: @echo "Clean: ads2gas files [$(TARGET_ARCH_ABI)]" - @$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS) - @$(RM) -r $(ASM_CNV_PATH) - @$(RM) $(CLEAN-OBJS) + $(qexec)$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS) + $(qexec)$(RM) -r $(ASM_CNV_PATH) + $(qexec)$(RM) $(CLEAN-OBJS) ifeq ($(ENABLE_SHARED),1) + LOCAL_CFLAGS += -fPIC include $(BUILD_SHARED_LIBRARY) else include $(BUILD_STATIC_LIBRARY) endif ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes) -$(call import-module,cpufeatures) +$(call import-module,android/cpufeatures) endif diff --git a/libs/libvpx/build/make/Makefile b/libs/libvpx/build/make/Makefile index 469eb74c3a..f6b3f0630f 100644 --- a/libs/libvpx/build/make/Makefile +++ b/libs/libvpx/build/make/Makefile @@ -90,7 +90,7 @@ all: .PHONY: clean clean:: - rm -f $(OBJS-yes) $(OBJS-yes:.o=.d) $(OBJS-yes:.asm.s.o=.asm.s) + rm -f $(OBJS-yes) $(OBJS-yes:.o=.d) $(OBJS-yes:.asm.S.o=.asm.S) rm -f $(CLEAN-OBJS) .PHONY: clean @@ -124,6 +124,7 @@ ifeq ($(TOOLCHAIN), x86-os2-gcc) CFLAGS += -mstackrealign endif +# x86[_64] $(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx $(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx $(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2 @@ -138,6 +139,12 @@ $(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx $(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx $(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 +$(BUILD_PFX)%_avx512.c.d: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl +$(BUILD_PFX)%_avx512.c.o: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl + +# POWER +$(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx +$(BUILD_PFX)%_vsx.c.o: CFLAGS += -maltivec -mvsx $(BUILD_PFX)%.c.d: %.c $(if $(quiet),@echo " [DEP] $@") @@ -180,13 +187,13 @@ $(BUILD_PFX)%.asm.o: %.asm $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@)) $(qexec)$(AS) $(ASFLAGS) -o $@ $< -$(BUILD_PFX)%.s.d: %.s +$(BUILD_PFX)%.S.d: %.S $(if $(quiet),@echo " [DEP] $@") $(qexec)mkdir -p $(dir $@) $(qexec)$(SRC_PATH_BARE)/build/make/gen_asm_deps.sh \ --build-pfx=$(BUILD_PFX) --depfile=$@ $(ASFLAGS) $< > $@ -$(BUILD_PFX)%.s.o: %.s +$(BUILD_PFX)%.S.o: %.S $(if $(quiet),@echo " [AS] $@") $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@)) $(qexec)$(AS) $(ASFLAGS) -o $@ $< @@ -198,8 +205,8 @@ $(BUILD_PFX)%.c.S: %.c $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@)) $(qexec)$(CC) -S $(CFLAGS) -o $@ $< -.PRECIOUS: %.asm.s -$(BUILD_PFX)%.asm.s: %.asm +.PRECIOUS: %.asm.S +$(BUILD_PFX)%.asm.S: %.asm $(if $(quiet),@echo " [ASM CONVERSION] $@") $(qexec)mkdir -p $(dir $@) $(qexec)$(ASM_CONVERSION) <$< >$@ diff --git a/libs/libvpx/build/make/ads2gas.pl b/libs/libvpx/build/make/ads2gas.pl index 7272424af2..029cc4a56f 100755 --- a/libs/libvpx/build/make/ads2gas.pl +++ b/libs/libvpx/build/make/ads2gas.pl @@ -138,14 +138,6 @@ while () 
s/DCD(.*)/.long $1/; s/DCB(.*)/.byte $1/; - # RN to .req - if (s/RN\s+([Rr]\d+|lr)/.req $1/) - { - print; - print "$comment_sub$comment\n" if defined $comment; - next; - } - # Make function visible to linker, and make additional symbol with # prepended underscore s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/; diff --git a/libs/libvpx/build/make/ads2gas_apple.pl b/libs/libvpx/build/make/ads2gas_apple.pl index 1a9e105ba8..e1ae7b4f87 100755 --- a/libs/libvpx/build/make/ads2gas_apple.pl +++ b/libs/libvpx/build/make/ads2gas_apple.pl @@ -120,18 +120,6 @@ while () s/DCD(.*)/.long $1/; s/DCB(.*)/.byte $1/; - # Build a hash of all the register - alias pairs. - if (s/(.*)RN(.*)/$1 .req $2/g) - { - $register_aliases{trim($1)} = trim($2); - next; - } - - while (($key, $value) = each(%register_aliases)) - { - s/\b$key\b/$value/g; - } - # Make function visible to linker, and make additional symbol with # prepended underscore s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/; diff --git a/libs/libvpx/build/make/configure.sh b/libs/libvpx/build/make/configure.sh index 745e10ddc8..683b430374 100644 --- a/libs/libvpx/build/make/configure.sh +++ b/libs/libvpx/build/make/configure.sh @@ -332,20 +332,10 @@ EOF } check_cflags() { - log check_cflags "$@" - - case "$CC" in - *gcc*|*clang) - check_cc -Werror "$@" <> config.mk @@ -645,7 +652,7 @@ setup_gnu_toolchain() { AS=${AS:-${CROSS}as} STRIP=${STRIP:-${CROSS}strip} NM=${NM:-${CROSS}nm} - AS_SFX=.s + AS_SFX=.S EXE_SFX= } @@ -684,7 +691,6 @@ check_xcode_minimum_version() { process_common_toolchain() { if [ -z "$toolchain" ]; then gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}" - # detect tgt_isa case "$gcctarget" in aarch64*) @@ -707,6 +713,18 @@ process_common_toolchain() { *sparc*) tgt_isa=sparc ;; + power*64*-*) + tgt_isa=ppc64 + ;; + power*) + tgt_isa=ppc + ;; + *mips64el*) + tgt_isa=mips64 + ;; + *mips32el*) + tgt_isa=mips32 + ;; esac # detect tgt_os @@ -735,9 +753,16 @@ process_common_toolchain() { tgt_isa=x86_64 tgt_os=darwin15 ;; + *darwin16*) + tgt_isa=x86_64 + tgt_os=darwin16 + ;; x86_64*mingw32*) tgt_os=win64 ;; + x86_64*cygwin*) + tgt_os=win64 + ;; *mingw32*|*cygwin*) [ -z "$tgt_isa" ] && tgt_isa=x86 tgt_os=win32 @@ -785,6 +810,9 @@ process_common_toolchain() { mips*) enable_feature mips ;; + ppc*) + enable_feature ppc + ;; esac # PIC is probably what we want when building shared libs @@ -853,6 +881,10 @@ process_common_toolchain() { add_cflags "-mmacosx-version-min=10.11" add_ldflags "-mmacosx-version-min=10.11" ;; + *-darwin16-*) + add_cflags "-mmacosx-version-min=10.12" + add_ldflags "-mmacosx-version-min=10.12" + ;; *-iphonesimulator-*) add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}" add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}" @@ -936,7 +968,7 @@ EOF ;; vs*) asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl" - AS_SFX=.s + AS_SFX=.S msvs_arch_dir=arm-msvs disable_feature multithread disable_feature unit_tests @@ -946,6 +978,7 @@ EOF # only "AppContainerApplication" which requires an AppxManifest. # Therefore disable the examples, just build the library. 
disable_feature examples + disable_feature tools fi ;; rvct) @@ -1044,7 +1077,7 @@ EOF STRIP="$(${XCRUN_FIND} strip)" NM="$(${XCRUN_FIND} nm)" RANLIB="$(${XCRUN_FIND} ranlib)" - AS_SFX=.s + AS_SFX=.S LD="${CXX:-$(${XCRUN_FIND} ld)}" # ASFLAGS is written here instead of using check_add_asflags @@ -1153,10 +1186,20 @@ EOF fi fi + if enabled mmi; then + tgt_isa=loongson3a + check_add_ldflags -march=loongson3a + fi + check_add_cflags -march=${tgt_isa} check_add_asflags -march=${tgt_isa} check_add_asflags -KPIC ;; + ppc*) + link_with_cc=gcc + setup_gnu_toolchain + check_gcc_machine_option "vsx" + ;; x86*) case ${tgt_os} in win*) @@ -1211,6 +1254,13 @@ EOF AS=msvs msvs_arch_dir=x86-msvs vc_version=${tgt_cc##vs} + case $vc_version in + 7|8|9|10|11|12|13|14) + echo "${tgt_cc} does not support avx512, disabling....." + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx512 " + soft_disable avx512 + ;; + esac case $vc_version in 7|8|9|10) echo "${tgt_cc} does not support avx/avx2, disabling....." @@ -1255,9 +1305,18 @@ EOF elif disabled $ext; then disable_exts="yes" else - # use the shortened version for the flag: sse4_1 -> sse4 - check_gcc_machine_option ${ext%_*} $ext + if [ "$ext" = "avx512" ]; then + check_gcc_machine_options $ext avx512f avx512cd avx512bw avx512dq avx512vl + else + # use the shortened version for the flag: sse4_1 -> sse4 + check_gcc_machine_option ${ext%_*} $ext + fi fi + + # https://bugs.chromium.org/p/webm/issues/detail?id=1464 + # The assembly optimizations for vpx_sub_pixel_variance do not link with + # gcc 6. + enabled sse2 && soft_enable pic done if enabled external_build; then @@ -1282,7 +1341,6 @@ EOF esac log_echo " using $AS" fi - [ "${AS##*/}" = nasm ] && add_asflags -Ox AS_SFX=.asm case ${tgt_os} in win32) @@ -1291,7 +1349,7 @@ EOF EXE_SFX=.exe ;; win64) - add_asflags -f x64 + add_asflags -f win64 enabled debug && add_asflags -g cv8 EXE_SFX=.exe ;; @@ -1425,6 +1483,10 @@ EOF echo "msa optimizations are available only for little endian platforms" disable_feature msa fi + if enabled mmi; then + echo "mmi optimizations are available only for little endian platforms" + disable_feature mmi + fi fi ;; esac diff --git a/libs/libvpx/build/make/gen_msvs_sln.sh b/libs/libvpx/build/make/gen_msvs_sln.sh index 7d5f468109..401223a0b5 100755 --- a/libs/libvpx/build/make/gen_msvs_sln.sh +++ b/libs/libvpx/build/make/gen_msvs_sln.sh @@ -25,7 +25,7 @@ files. 
Options: --help Print this message --out=outfile Redirect output to a file - --ver=version Version (7,8,9,10,11,12,14) of visual studio to generate for + --ver=version Version (7,8,9,10,11,12,14,15) of visual studio to generate for --target=isa-os-cc Target specifier EOF exit 1 @@ -215,7 +215,7 @@ for opt in "$@"; do ;; --ver=*) vs_ver="$optval" case $optval in - 10|11|12|14) + 10|11|12|14|15) ;; *) die Unrecognized Visual Studio Version in $opt ;; @@ -240,9 +240,12 @@ case "${vs_ver:-10}" in 12) sln_vers="12.00" sln_vers_str="Visual Studio 2013" ;; - 14) sln_vers="14.00" + 14) sln_vers="12.00" sln_vers_str="Visual Studio 2015" ;; + 15) sln_vers="12.00" + sln_vers_str="Visual Studio 2017" + ;; esac sfx=vcxproj diff --git a/libs/libvpx/build/make/gen_msvs_vcxproj.sh b/libs/libvpx/build/make/gen_msvs_vcxproj.sh index e98611d102..171d0b99b6 100755 --- a/libs/libvpx/build/make/gen_msvs_vcxproj.sh +++ b/libs/libvpx/build/make/gen_msvs_vcxproj.sh @@ -34,7 +34,7 @@ Options: --name=project_name Name of the project (required) --proj-guid=GUID GUID to use for the project --module-def=filename File containing export definitions (for DLLs) - --ver=version Version (10,11,12,14) of visual studio to generate for + --ver=version Version (10,11,12,14,15) of visual studio to generate for --src-path-bare=dir Path to root of source tree -Ipath/to/include Additional include directories -DFLAG[=value] Preprocessor macros to define @@ -82,7 +82,7 @@ generate_filter() { | sed -e "s,$src_path_bare,," \ -e 's/^[\./]\+//g' -e 's,[:/ ],_,g') - if ([ "$pat" == "asm" ] || [ "$pat" == "s" ]) && $asm_use_custom_step; then + if ([ "$pat" == "asm" ] || [ "$pat" == "s" ] || [ "$pat" == "S" ]) && $asm_use_custom_step; then # Avoid object file name collisions, i.e. vpx_config.c and # vpx_config.asm produce the same object file without # this additional suffix. @@ -168,7 +168,7 @@ for opt in "$@"; do --ver=*) vs_ver="$optval" case "$optval" in - 10|11|12|14) + 10|11|12|14|15) ;; *) die Unrecognized Visual Studio Version in $opt ;; @@ -203,7 +203,7 @@ for opt in "$@"; do # The paths in file_list are fixed outside of the loop. 
file_list[${#file_list[@]}]="$opt" case "$opt" in - *.asm|*.s) uses_asm=true + *.asm|*.[Ss]) uses_asm=true ;; esac ;; @@ -218,7 +218,7 @@ guid=${guid:-`generate_uuid`} asm_use_custom_step=false uses_asm=${uses_asm:-false} case "${vs_ver:-11}" in - 10|11|12|14) + 10|11|12|14|15) asm_use_custom_step=$uses_asm ;; esac @@ -347,6 +347,9 @@ generate_vcxproj() { if [ "$vs_ver" = "14" ]; then tag_content PlatformToolset v140 fi + if [ "$vs_ver" = "15" ]; then + tag_content PlatformToolset v141 + fi tag_content CharacterSet Unicode if [ "$config" = "Release" ]; then tag_content WholeProgramOptimization true @@ -452,7 +455,7 @@ generate_vcxproj() { done open_tag ItemGroup - generate_filter "Source Files" "c;cc;cpp;def;odl;idl;hpj;bat;asm;asmx;s" + generate_filter "Source Files" "c;cc;cpp;def;odl;idl;hpj;bat;asm;asmx;s;S" close_tag ItemGroup open_tag ItemGroup generate_filter "Header Files" "h;hm;inl;inc;xsd" diff --git a/libs/libvpx/build/make/iosbuild.sh b/libs/libvpx/build/make/iosbuild.sh index c703f22b0c..365a8c0130 100755 --- a/libs/libvpx/build/make/iosbuild.sh +++ b/libs/libvpx/build/make/iosbuild.sh @@ -35,8 +35,8 @@ ARM_TARGETS="arm64-darwin-gcc armv7s-darwin-gcc" SIM_TARGETS="x86-iphonesimulator-gcc x86_64-iphonesimulator-gcc" -OSX_TARGETS="x86-darwin15-gcc - x86_64-darwin15-gcc" +OSX_TARGETS="x86-darwin16-gcc + x86_64-darwin16-gcc" TARGETS="${ARM_TARGETS} ${SIM_TARGETS}" # Configures for the target specified by $1, and invokes make with the dist @@ -271,7 +271,7 @@ cat << EOF --help: Display this message and exit. --enable-shared: Build a dynamic framework for use on iOS 8 or later. --extra-configure-args : Extra args to pass when configuring libvpx. - --macosx: Uses darwin15 targets instead of iphonesimulator targets for x86 + --macosx: Uses darwin16 targets instead of iphonesimulator targets for x86 and x86_64. Allows linking to framework when builds target MacOSX instead of iOS. --preserve-build-output: Do not delete the build directory. diff --git a/libs/libvpx/build/make/rtcd.pl b/libs/libvpx/build/make/rtcd.pl index 9e746c46d0..68e92b52cc 100755 --- a/libs/libvpx/build/make/rtcd.pl +++ b/libs/libvpx/build/make/rtcd.pl @@ -1,4 +1,13 @@ #!/usr/bin/env perl +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## no strict 'refs'; use warnings; @@ -200,6 +209,7 @@ sub filter { sub common_top() { my $include_guard = uc($opts{sym})."_H_"; print < $$.tmp else cat<$$.tmp +// This file is generated. Do not edit. 
#define VERSION_MAJOR $major_version #define VERSION_MINOR $minor_version #define VERSION_PATCH $patch_version diff --git a/libs/libvpx/configure b/libs/libvpx/configure index 95af2e395e..e5a74c6f2a 100755 --- a/libs/libvpx/configure +++ b/libs/libvpx/configure @@ -22,6 +22,7 @@ show_help(){ Advanced options: ${toggle_libs} libraries ${toggle_examples} examples + ${toggle_tools} tools ${toggle_docs} documentation ${toggle_unit_tests} unit tests ${toggle_decode_perf_tests} build decoder perf tests with unit tests @@ -108,10 +109,13 @@ all_platforms="${all_platforms} armv7-none-rvct" #neon Cortex-A8 all_platforms="${all_platforms} armv7-win32-vs11" all_platforms="${all_platforms} armv7-win32-vs12" all_platforms="${all_platforms} armv7-win32-vs14" +all_platforms="${all_platforms} armv7-win32-vs15" all_platforms="${all_platforms} armv7s-darwin-gcc" all_platforms="${all_platforms} armv8-linux-gcc" all_platforms="${all_platforms} mips32-linux-gcc" all_platforms="${all_platforms} mips64-linux-gcc" +all_platforms="${all_platforms} ppc64-linux-gcc" +all_platforms="${all_platforms} ppc64le-linux-gcc" all_platforms="${all_platforms} sparc-solaris-gcc" all_platforms="${all_platforms} x86-android-gcc" all_platforms="${all_platforms} x86-darwin8-gcc" @@ -124,6 +128,7 @@ all_platforms="${all_platforms} x86-darwin12-gcc" all_platforms="${all_platforms} x86-darwin13-gcc" all_platforms="${all_platforms} x86-darwin14-gcc" all_platforms="${all_platforms} x86-darwin15-gcc" +all_platforms="${all_platforms} x86-darwin16-gcc" all_platforms="${all_platforms} x86-iphonesimulator-gcc" all_platforms="${all_platforms} x86-linux-gcc" all_platforms="${all_platforms} x86-linux-icc" @@ -134,6 +139,7 @@ all_platforms="${all_platforms} x86-win32-vs10" all_platforms="${all_platforms} x86-win32-vs11" all_platforms="${all_platforms} x86-win32-vs12" all_platforms="${all_platforms} x86-win32-vs14" +all_platforms="${all_platforms} x86-win32-vs15" all_platforms="${all_platforms} x86_64-android-gcc" all_platforms="${all_platforms} x86_64-darwin9-gcc" all_platforms="${all_platforms} x86_64-darwin10-gcc" @@ -142,6 +148,7 @@ all_platforms="${all_platforms} x86_64-darwin12-gcc" all_platforms="${all_platforms} x86_64-darwin13-gcc" all_platforms="${all_platforms} x86_64-darwin14-gcc" all_platforms="${all_platforms} x86_64-darwin15-gcc" +all_platforms="${all_platforms} x86_64-darwin16-gcc" all_platforms="${all_platforms} x86_64-iphonesimulator-gcc" all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" @@ -151,22 +158,26 @@ all_platforms="${all_platforms} x86_64-win64-vs10" all_platforms="${all_platforms} x86_64-win64-vs11" all_platforms="${all_platforms} x86_64-win64-vs12" all_platforms="${all_platforms} x86_64-win64-vs14" +all_platforms="${all_platforms} x86_64-win64-vs15" all_platforms="${all_platforms} generic-gnu" # all_targets is a list of all targets that can be configured # note that these should be in dependency order for now. -all_targets="libs examples docs" +all_targets="libs examples tools docs" # all targets available are enabled, by default. for t in ${all_targets}; do [ -f "${source_path}/${t}.mk" ] && enable_feature ${t} done +if ! diff --version >/dev/null; then + die "diff missing: Try installing diffutils via your package manager." +fi + if ! 
perl --version >/dev/null; then die "Perl is required to build" fi - if [ "`cd \"${source_path}\" && pwd`" != "`pwd`" ]; then # test to see if source_path already configured if [ -f "${source_path}/vpx_config.h" ]; then @@ -222,6 +233,7 @@ ARCH_LIST=" mips x86 x86_64 + ppc " ARCH_EXT_LIST_X86=" mmx @@ -232,7 +244,13 @@ ARCH_EXT_LIST_X86=" sse4_1 avx avx2 + avx512 " + +ARCH_EXT_LIST_LOONGSON=" + mmi +" + ARCH_EXT_LIST=" neon neon_asm @@ -243,6 +261,10 @@ ARCH_EXT_LIST=" mips64 ${ARCH_EXT_LIST_X86} + + vsx + + ${ARCH_EXT_LIST_LOONGSON} " HAVE_LIST=" ${ARCH_EXT_LIST} @@ -254,7 +276,6 @@ EXPERIMENT_LIST=" spatial_svc fp_mb_stats emulate_hardware - misc_fixes " CONFIG_LIST=" dependency_tracking @@ -309,6 +330,7 @@ CONFIG_LIST=" better_hw_compatibility experimental size_limit + always_adjust_bpm ${EXPERIMENT_LIST} " CMDLINE_SELECT=" @@ -331,6 +353,7 @@ CMDLINE_SELECT=" libs examples + tools docs libc as @@ -367,6 +390,7 @@ CMDLINE_SELECT=" better_hw_compatibility vp9_highbitdepth experimental + always_adjust_bpm " process_cmdline() { @@ -476,7 +500,7 @@ EOF # # Write makefiles for all enabled targets # - for tgt in libs examples docs solution; do + for tgt in libs examples tools docs solution; do tgt_fn="$tgt-$toolchain.mk" if enabled $tgt; then @@ -568,6 +592,7 @@ process_toolchain() { check_add_cflags -Wdeclaration-after-statement check_add_cflags -Wdisabled-optimization check_add_cflags -Wfloat-conversion + check_add_cflags -Wparentheses-equality check_add_cflags -Wpointer-arith check_add_cflags -Wtype-limits check_add_cflags -Wcast-qual @@ -575,12 +600,20 @@ process_toolchain() { check_add_cflags -Wimplicit-function-declaration check_add_cflags -Wuninitialized check_add_cflags -Wunused + # -Wextra has some tricky cases. Rather than fix them all now, get the + # flag for as many files as possible and fix the remaining issues + # piecemeal. + # https://bugs.chromium.org/p/webm/issues/detail?id=1069 + check_add_cflags -Wextra # check_add_cflags also adds to cxxflags. gtest does not do well with # -Wundef so add it explicitly to CFLAGS only. check_cflags -Wundef && add_cflags_only -Wundef if enabled mips || [ -z "${INLINE}" ]; then enabled extra_warnings || check_add_cflags -Wno-unused-function fi + # Avoid this warning for third_party C++ sources. Some reorganization + # would be needed to apply this only to test/*.cc. 
+ check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32 fi if enabled icc; then @@ -632,7 +665,7 @@ process_toolchain() { gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror" all_targets="${all_targets} solution" - INLINE="__forceinline" + INLINE="__inline" ;; esac diff --git a/libs/libvpx/examples.mk b/libs/libvpx/examples.mk index cc7fb1ddcf..38c4d75c51 100644 --- a/libs/libvpx/examples.mk +++ b/libs/libvpx/examples.mk @@ -76,6 +76,7 @@ vpxdec.SRCS += tools_common.c tools_common.h vpxdec.SRCS += y4menc.c y4menc.h ifeq ($(CONFIG_LIBYUV),yes) vpxdec.SRCS += $(LIBYUV_SRCS) + $(BUILD_PFX)third_party/libyuv/%.cc.o: CXXFLAGS += -Wno-unused-parameter endif ifeq ($(CONFIG_WEBM_IO),yes) vpxdec.SRCS += $(LIBWEBM_COMMON_SRCS) diff --git a/libs/libvpx/examples/vp8_multi_resolution_encoder.c b/libs/libvpx/examples/vp8_multi_resolution_encoder.c index 65308a0bd0..b14b1ff397 100644 --- a/libs/libvpx/examples/vp8_multi_resolution_encoder.c +++ b/libs/libvpx/examples/vp8_multi_resolution_encoder.c @@ -151,7 +151,7 @@ static void write_ivf_frame_header(FILE *outfile, if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return; pts = pkt->data.frame.pts; - mem_put_le32(header, pkt->data.frame.sz); + mem_put_le32(header, (int)pkt->data.frame.sz); mem_put_le32(header + 4, pts & 0xFFFFFFFF); mem_put_le32(header + 8, pts >> 32); @@ -190,7 +190,7 @@ static void set_temporal_layer_pattern(int num_temporal_layers, cfg->ts_layer_id[0] = 0; cfg->ts_layer_id[1] = 1; // Use 60/40 bit allocation as example. - cfg->ts_target_bitrate[0] = 0.6f * bitrate; + cfg->ts_target_bitrate[0] = (int)(0.6f * bitrate); cfg->ts_target_bitrate[1] = bitrate; /* 0=L, 1=GF */ @@ -240,9 +240,9 @@ static void set_temporal_layer_pattern(int num_temporal_layers, cfg->ts_layer_id[1] = 2; cfg->ts_layer_id[2] = 1; cfg->ts_layer_id[3] = 2; - // Use 40/20/40 bit allocation as example. - cfg->ts_target_bitrate[0] = 0.4f * bitrate; - cfg->ts_target_bitrate[1] = 0.6f * bitrate; + // Use 45/20/35 bit allocation as example. + cfg->ts_target_bitrate[0] = (int)(0.45f * bitrate); + cfg->ts_target_bitrate[1] = (int)(0.65f * bitrate); cfg->ts_target_bitrate[2] = bitrate; /* 0=L, 1=GF, 2=ARF */ @@ -294,8 +294,8 @@ int main(int argc, char **argv) { vpx_codec_err_t res[NUM_ENCODERS]; int i; - long width; - long height; + int width; + int height; int length_frame; int frame_avail; int got_data; @@ -347,9 +347,9 @@ int main(int argc, char **argv) { printf("Using %s\n", vpx_codec_iface_name(interface)); - width = strtol(argv[1], NULL, 0); - height = strtol(argv[2], NULL, 0); - framerate = strtol(argv[3], NULL, 0); + width = (int)strtol(argv[1], NULL, 0); + height = (int)strtol(argv[2], NULL, 0); + framerate = (int)strtol(argv[3], NULL, 0); if (width < 16 || width % 2 || height < 16 || height % 2) die("Invalid resolution: %ldx%ld", width, height); @@ -371,12 +371,13 @@ int main(int argc, char **argv) { // Bitrates per spatial layer: overwrite default rates above. for (i = 0; i < NUM_ENCODERS; i++) { - target_bitrate[i] = strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0); + target_bitrate[i] = (int)strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0); } // Temporal layers per spatial layers: overwrite default settings above. 
for (i = 0; i < NUM_ENCODERS; i++) { - num_temporal_layers[i] = strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0); + num_temporal_layers[i] = + (int)strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0); if (num_temporal_layers[i] < 1 || num_temporal_layers[i] > 3) die("Invalid temporal layers: %d, Must be 1, 2, or 3. \n", num_temporal_layers); @@ -391,9 +392,9 @@ int main(int argc, char **argv) { downsampled_input[i] = fopen(filename, "wb"); } - key_frame_insert = strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0); + key_frame_insert = (int)strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0); - show_psnr = strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0); + show_psnr = (int)strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0); /* Populate default encoder configuration */ for (i = 0; i < NUM_ENCODERS; i++) { @@ -460,7 +461,7 @@ int main(int argc, char **argv) { // Set the number of threads per encode/spatial layer. // (1, 1, 1) means no encoder threading. - cfg[0].g_threads = 2; + cfg[0].g_threads = 1; cfg[1].g_threads = 1; cfg[2].g_threads = 1; @@ -469,7 +470,7 @@ int main(int argc, char **argv) { if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32)) die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h); - if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w) + if (raw[0].stride[VPX_PLANE_Y] == (int)raw[0].d_w) read_frame_p = read_frame; else read_frame_p = read_frame_by_row; @@ -507,9 +508,11 @@ int main(int argc, char **argv) { /* Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING */ /* Enable denoising for the highest-resolution encoder. */ - if (vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 4)) + if (vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 1)) die_codec(&codec[0], "Failed to set noise_sensitivity"); - for (i = 1; i < NUM_ENCODERS; i++) { + if (vpx_codec_control(&codec[1], VP8E_SET_NOISE_SENSITIVITY, 1)) + die_codec(&codec[1], "Failed to set noise_sensitivity"); + for (i = 2; i < NUM_ENCODERS; i++) { if (vpx_codec_control(&codec[i], VP8E_SET_NOISE_SENSITIVITY, 0)) die_codec(&codec[i], "Failed to set noise_sensitivity"); } @@ -556,7 +559,8 @@ int main(int argc, char **argv) { /* Write out down-sampled input. */ length_frame = cfg[i].g_w * cfg[i].g_h * 3 / 2; if (fwrite(raw[i].planes[0], 1, length_frame, - downsampled_input[NUM_ENCODERS - i - 1]) != length_frame) { + downsampled_input[NUM_ENCODERS - i - 1]) != + (unsigned int)length_frame) { return EXIT_FAILURE; } } @@ -617,10 +621,6 @@ int main(int argc, char **argv) { break; default: break; } - printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT && - (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY) - ? 
"K" - : ""); fflush(stdout); } } @@ -661,7 +661,6 @@ int main(int argc, char **argv) { write_ivf_file_header(outfile[i], &cfg[i], frame_cnt - 1); fclose(outfile[i]); } - printf("\n"); return EXIT_SUCCESS; } diff --git a/libs/libvpx/examples/vp9_spatial_svc_encoder.c b/libs/libvpx/examples/vp9_spatial_svc_encoder.c index cecdce0804..0987cbfb85 100644 --- a/libs/libvpx/examples/vp9_spatial_svc_encoder.c +++ b/libs/libvpx/examples/vp9_spatial_svc_encoder.c @@ -84,6 +84,8 @@ static const arg_def_t speed_arg = ARG_DEF("sp", "speed", 1, "speed configuration"); static const arg_def_t aqmode_arg = ARG_DEF("aq", "aqmode", 1, "aq-mode off/on"); +static const arg_def_t bitrates_arg = + ARG_DEF("bl", "bitrates", 1, "bitrates[sl * num_tl + tl]"); #if CONFIG_VP9_HIGHBITDEPTH static const struct arg_enum_list bitdepth_enum[] = { @@ -124,6 +126,7 @@ static const arg_def_t *svc_args[] = { &frames_arg, #endif &speed_arg, &rc_end_usage_arg, + &bitrates_arg, NULL }; static const uint32_t default_frames_to_skip = 0; @@ -165,7 +168,7 @@ void usage_exit(void) { static void parse_command_line(int argc, const char **argv_, AppInput *app_input, SvcContext *svc_ctx, vpx_codec_enc_cfg_t *enc_cfg) { - struct arg arg = { 0 }; + struct arg arg; char **argv = NULL; char **argi = NULL; char **argj = NULL; @@ -250,6 +253,9 @@ static void parse_command_line(int argc, const char **argv_, } else if (arg_match(&arg, &scale_factors_arg, argi)) { snprintf(string_options, sizeof(string_options), "%s scale-factors=%s", string_options, arg.val); + } else if (arg_match(&arg, &bitrates_arg, argi)) { + snprintf(string_options, sizeof(string_options), "%s bitrates=%s", + string_options, arg.val); } else if (arg_match(&arg, &passes_arg, argi)) { passes = arg_parse_uint(&arg); if (passes < 1 || passes > 2) { @@ -417,7 +423,6 @@ static void set_rate_control_stats(struct RateControlStats *rc, for (sl = 0; sl < cfg->ss_number_layers; ++sl) { for (tl = 0; tl < cfg->ts_number_layers; ++tl) { const int layer = sl * cfg->ts_number_layers + tl; - const int tlayer0 = sl * cfg->ts_number_layers; if (cfg->ts_number_layers == 1) rc->layer_framerate[layer] = framerate; else @@ -428,8 +433,8 @@ static void set_rate_control_stats(struct RateControlStats *rc, cfg->layer_target_bitrate[layer - 1]) / (rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]); } else { - rc->layer_pfb[tlayer0] = 1000.0 * cfg->layer_target_bitrate[tlayer0] / - rc->layer_framerate[tlayer0]; + rc->layer_pfb[layer] = 1000.0 * cfg->layer_target_bitrate[layer] / + rc->layer_framerate[layer]; } rc->layer_input_frames[layer] = 0; rc->layer_enc_frames[layer] = 0; @@ -449,12 +454,13 @@ static void printout_rate_control_summary(struct RateControlStats *rc, vpx_codec_enc_cfg_t *cfg, int frame_cnt) { unsigned int sl, tl; - int tot_num_frames = 0; double perc_fluctuation = 0.0; + int tot_num_frames = 0; printf("Total number of processed frames: %d\n\n", frame_cnt - 1); printf("Rate control layer stats for sl%d tl%d layer(s):\n\n", cfg->ss_number_layers, cfg->ts_number_layers); for (sl = 0; sl < cfg->ss_number_layers; ++sl) { + tot_num_frames = 0; for (tl = 0; tl < cfg->ts_number_layers; ++tl) { const int layer = sl * cfg->ts_number_layers + tl; const int num_dropped = @@ -462,7 +468,7 @@ static void printout_rate_control_summary(struct RateControlStats *rc, ? 
(rc->layer_input_frames[layer] - rc->layer_enc_frames[layer]) : (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer] - 1); - if (!sl) tot_num_frames += rc->layer_input_frames[layer]; + tot_num_frames += rc->layer_input_frames[layer]; rc->layer_encoding_bitrate[layer] = 0.001 * rc->layer_framerate[layer] * rc->layer_encoding_bitrate[layer] / tot_num_frames; @@ -503,7 +509,7 @@ static void printout_rate_control_summary(struct RateControlStats *rc, } vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, - uint32_t sizes[8], int *count) { + uint64_t sizes[8], int *count) { // A chunk ending with a byte matching 0xc0 is an invalid chunk unless // it is a super frame index. If the last byte of real video compression // data is 0xc0 the encoder must add a 0 byte. If we have the marker but @@ -600,9 +606,9 @@ void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers, } int main(int argc, const char **argv) { - AppInput app_input = { 0 }; + AppInput app_input; VpxVideoWriter *writer = NULL; - VpxVideoInfo info = { 0 }; + VpxVideoInfo info; vpx_codec_ctx_t codec; vpx_codec_enc_cfg_t enc_cfg; SvcContext svc_ctx; @@ -620,7 +626,7 @@ int main(int argc, const char **argv) { struct RateControlStats rc; vpx_svc_layer_id_t layer_id; vpx_svc_ref_frame_config_t ref_frame_config; - int sl, tl; + unsigned int sl, tl; double sum_bitrate = 0.0; double sum_bitrate2 = 0.0; double framerate = 30.0; @@ -634,8 +640,9 @@ int main(int argc, const char **argv) { // Allocate image buffer #if CONFIG_VP9_HIGHBITDEPTH - if (!vpx_img_alloc(&raw, enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420 - : VPX_IMG_FMT_I42016, + if (!vpx_img_alloc(&raw, + enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420 + : VPX_IMG_FMT_I42016, enc_cfg.g_w, enc_cfg.g_h, 32)) { die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h); } @@ -673,7 +680,7 @@ int main(int argc, const char **argv) { } #if OUTPUT_RC_STATS // For now, just write temporal layer streams. - // TODO(wonkap): do spatial by re-writing superframe. + // TODO(marpan): do spatial by re-writing superframe. if (svc_ctx.output_rc_stat) { for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) { char file_name[PATH_MAX]; @@ -691,10 +698,18 @@ int main(int argc, const char **argv) { if (svc_ctx.speed != -1) vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed); - if (svc_ctx.threads) + if (svc_ctx.threads) { vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1)); + if (svc_ctx.threads > 1) + vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1); + else + vpx_codec_control(&codec, VP9E_SET_ROW_MT, 0); + } if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1) vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); + if (svc_ctx.speed >= 5) + vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); + vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, 900); // Encode frames while (!end_of_stream) { @@ -730,7 +745,7 @@ int main(int argc, const char **argv) { &ref_frame_config); // Keep track of input frames, to account for frame drops in rate control // stats/metrics. 
- for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { + for (sl = 0; sl < (unsigned int)enc_cfg.ss_number_layers; ++sl) { ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers + layer_id.temporal_layer_id]; } @@ -755,18 +770,20 @@ int main(int argc, const char **argv) { SvcInternal_t *const si = (SvcInternal_t *)svc_ctx.internal; if (cx_pkt->data.frame.sz > 0) { #if OUTPUT_RC_STATS - uint32_t sizes[8]; + uint64_t sizes[8]; int count = 0; #endif vpx_video_writer_write_frame(writer, cx_pkt->data.frame.buf, cx_pkt->data.frame.sz, cx_pkt->data.frame.pts); #if OUTPUT_RC_STATS - // TODO(marpan/wonkap): Put this (to line728) in separate function. + // TODO(marpan): Put this (to line728) in separate function. if (svc_ctx.output_rc_stat) { vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id); parse_superframe_index(cx_pkt->data.frame.buf, cx_pkt->data.frame.sz, sizes, &count); + if (enc_cfg.ss_number_layers == 1) + sizes[0] = cx_pkt->data.frame.sz; // Note computing input_layer_frames here won't account for frame // drops in rate control stats. // TODO(marpan): Fix this for non-bypass mode so we can get stats @@ -793,7 +810,7 @@ int main(int argc, const char **argv) { rc.layer_encoding_bitrate[layer] += 8.0 * sizes[sl]; // Keep count of rate control stats per layer, for non-key // frames. - if (tl == layer_id.temporal_layer_id && + if (tl == (unsigned int)layer_id.temporal_layer_id && !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) { rc.layer_avg_frame_size[layer] += 8.0 * sizes[sl]; rc.layer_avg_rate_mismatch[layer] += @@ -807,7 +824,7 @@ int main(int argc, const char **argv) { // Update for short-time encoding bitrate states, for moving // window of size rc->window, shifted by rc->window / 2. // Ignore first window segment, due to key frame. - if (frame_cnt > rc.window_size) { + if (frame_cnt > (unsigned int)rc.window_size) { tl = layer_id.temporal_layer_id; for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate; @@ -823,13 +840,14 @@ int main(int argc, const char **argv) { } // Second shifted window. 
- if (frame_cnt > rc.window_size + rc.window_size / 2) { + if (frame_cnt > + (unsigned int)(rc.window_size + rc.window_size / 2)) { tl = layer_id.temporal_layer_id; for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate; } - if (frame_cnt > 2 * rc.window_size && + if (frame_cnt > (unsigned int)(2 * rc.window_size) && frame_cnt % rc.window_size == 0) { rc.window_count += 1; rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size; @@ -842,10 +860,11 @@ int main(int argc, const char **argv) { } #endif } - + /* printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received, !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY), (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts); + */ if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1) si->bytes_sum[0] += (int)cx_pkt->data.frame.sz; ++frames_received; diff --git a/libs/libvpx/examples/vp9cx_set_ref.c b/libs/libvpx/examples/vp9cx_set_ref.c index 798d7e3f2b..3472689db2 100644 --- a/libs/libvpx/examples/vp9cx_set_ref.c +++ b/libs/libvpx/examples/vp9cx_set_ref.c @@ -304,6 +304,7 @@ int main(int argc, char **argv) { const char *height_arg = NULL; const char *infile_arg = NULL; const char *outfile_arg = NULL; + const char *update_frame_num_arg = NULL; unsigned int limit = 0; vp9_zero(ecodec); @@ -318,18 +319,21 @@ int main(int argc, char **argv) { height_arg = argv[2]; infile_arg = argv[3]; outfile_arg = argv[4]; + update_frame_num_arg = argv[5]; encoder = get_vpx_encoder_by_name("vp9"); if (!encoder) die("Unsupported codec."); - update_frame_num = atoi(argv[5]); + update_frame_num = (unsigned int)strtoul(update_frame_num_arg, NULL, 0); // In VP9, the reference buffers (cm->buffer_pool->frame_bufs[i].buf) are // allocated while calling vpx_codec_encode(), thus, setting reference for // 1st frame isn't supported. - if (update_frame_num <= 1) die("Couldn't parse frame number '%s'\n", argv[5]); + if (update_frame_num <= 1) { + die("Couldn't parse frame number '%s'\n", update_frame_num_arg); + } if (argc > 6) { - limit = atoi(argv[6]); + limit = (unsigned int)strtoul(argv[6], NULL, 0); if (update_frame_num > limit) die("Update frame number couldn't larger than limit\n"); } diff --git a/libs/libvpx/examples/vpx_temporal_svc_encoder.c b/libs/libvpx/examples/vpx_temporal_svc_encoder.c index 309a2fe2e2..f5736ea45d 100644 --- a/libs/libvpx/examples/vpx_temporal_svc_encoder.c +++ b/libs/libvpx/examples/vpx_temporal_svc_encoder.c @@ -26,17 +26,27 @@ #include "../tools_common.h" #include "../video_writer.h" +#define VP8_ROI_MAP 0 + static const char *exec_name; void usage_exit(void) { exit(EXIT_FAILURE); } -// Denoiser states, for temporal denoising. -enum denoiserState { - kDenoiserOff, - kDenoiserOnYOnly, - kDenoiserOnYUV, - kDenoiserOnYUVAggressive, - kDenoiserOnAdaptive +// Denoiser states for vp8, for temporal denoising. +enum denoiserStateVp8 { + kVp8DenoiserOff, + kVp8DenoiserOnYOnly, + kVp8DenoiserOnYUV, + kVp8DenoiserOnYUVAggressive, + kVp8DenoiserOnAdaptive +}; + +// Denoiser states for vp9, for temporal denoising. +enum denoiserStateVp9 { + kVp9DenoiserOff, + kVp9DenoiserOnYOnly, + // For SVC: denoise the top two spatial layers. + kVp9DenoiserOnYTwoSpatialLayers }; static int mode_to_num_layers[13] = { 1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3, 3 }; @@ -154,6 +164,53 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc, die("Error: Number of input frames not equal to output! 
\n"); } +#if VP8_ROI_MAP +static void vp8_set_roi_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi) { + unsigned int i, j; + memset(roi, 0, sizeof(*roi)); + + // ROI is based on the segments (4 for vp8, 8 for vp9), smallest unit for + // segment is 16x16 for vp8, 8x8 for vp9. + roi->rows = (cfg->g_h + 15) / 16; + roi->cols = (cfg->g_w + 15) / 16; + + // Applies delta QP on the segment blocks, varies from -63 to 63. + // Setting to negative means lower QP (better quality). + // Below we set delta_q to the extreme (-63) to show strong effect. + roi->delta_q[0] = 0; + roi->delta_q[1] = -63; + roi->delta_q[2] = 0; + roi->delta_q[3] = 0; + + // Applies delta loopfilter strength on the segment blocks, varies from -63 to + // 63. Setting to positive means stronger loopfilter. + roi->delta_lf[0] = 0; + roi->delta_lf[1] = 0; + roi->delta_lf[2] = 0; + roi->delta_lf[3] = 0; + + // Applies skip encoding threshold on the segment blocks, varies from 0 to + // UINT_MAX. Larger value means more skipping of encoding is possible. + // This skip threshold only applies on delta frames. + roi->static_threshold[0] = 0; + roi->static_threshold[1] = 0; + roi->static_threshold[2] = 0; + roi->static_threshold[3] = 0; + + // Use 2 states: 1 is center square, 0 is the rest. + roi->roi_map = + (uint8_t *)calloc(roi->rows * roi->cols, sizeof(*roi->roi_map)); + for (i = 0; i < roi->rows; ++i) { + for (j = 0; j < roi->cols; ++j) { + if (i > (roi->rows >> 2) && i < ((roi->rows * 3) >> 2) && + j > (roi->cols >> 2) && j < ((roi->cols * 3) >> 2)) { + roi->roi_map[i * roi->cols + j] = 1; + } + } + } +} +#endif + // Temporal scaling parameters: // NOTE: The 3 prediction frames cannot be used interchangeably due to // differences in the way they are handled throughout the code. The @@ -495,6 +552,7 @@ int main(int argc, char **argv) { vpx_codec_err_t res; unsigned int width; unsigned int height; + uint32_t error_resilient = 0; int speed; int frame_avail; int got_data; @@ -505,16 +563,15 @@ int main(int argc, char **argv) { int layering_mode = 0; int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 }; int flag_periodicity = 1; -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) - vpx_svc_layer_id_t layer_id = { 0, 0 }; -#else - vpx_svc_layer_id_t layer_id = { 0 }; +#if VP8_ROI_MAP + vpx_roi_map_t roi; #endif + vpx_svc_layer_id_t layer_id = { 0, 0 }; const VpxInterface *encoder = NULL; FILE *infile = NULL; struct RateControlMetrics rc; int64_t cx_time = 0; - const int min_args_base = 11; + const int min_args_base = 13; #if CONFIG_VP9_HIGHBITDEPTH vpx_bit_depth_t bit_depth = VPX_BITS_8; int input_bit_depth = 8; @@ -531,12 +588,14 @@ int main(int argc, char **argv) { if (argc < min_args) { #if CONFIG_VP9_HIGHBITDEPTH die("Usage: %s " - " " + " " + " " " ... \n", argv[0]); #else die("Usage: %s " - " " + " " + " " " ... 
\n", argv[0]); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -553,9 +612,9 @@ int main(int argc, char **argv) { die("Invalid resolution: %d x %d", width, height); } - layering_mode = (int)strtol(argv[10], NULL, 0); + layering_mode = (int)strtol(argv[12], NULL, 0); if (layering_mode < 0 || layering_mode > 13) { - die("Invalid layering mode (0..12) %s", argv[10]); + die("Invalid layering mode (0..12) %s", argv[12]); } if (argc != min_args + mode_to_num_layers[layering_mode]) { @@ -619,11 +678,11 @@ int main(int argc, char **argv) { for (i = min_args_base; (int)i < min_args_base + mode_to_num_layers[layering_mode]; ++i) { - rc.layer_target_bitrate[i - 11] = (int)strtol(argv[i], NULL, 0); + rc.layer_target_bitrate[i - 13] = (int)strtol(argv[i], NULL, 0); if (strncmp(encoder->name, "vp8", 3) == 0) - cfg.ts_target_bitrate[i - 11] = rc.layer_target_bitrate[i - 11]; + cfg.ts_target_bitrate[i - 13] = rc.layer_target_bitrate[i - 13]; else if (strncmp(encoder->name, "vp9", 3) == 0) - cfg.layer_target_bitrate[i - 11] = rc.layer_target_bitrate[i - 11]; + cfg.layer_target_bitrate[i - 13] = rc.layer_target_bitrate[i - 13]; } // Real time parameters. @@ -634,7 +693,7 @@ int main(int argc, char **argv) { if (strncmp(encoder->name, "vp9", 3) == 0) cfg.rc_max_quantizer = 52; cfg.rc_undershoot_pct = 50; cfg.rc_overshoot_pct = 50; - cfg.rc_buf_initial_sz = 500; + cfg.rc_buf_initial_sz = 600; cfg.rc_buf_optimal_sz = 600; cfg.rc_buf_sz = 1000; @@ -642,10 +701,14 @@ int main(int argc, char **argv) { cfg.rc_resize_allowed = 0; // Use 1 thread as default. - cfg.g_threads = 1; + cfg.g_threads = (unsigned int)strtoul(argv[11], NULL, 0); + error_resilient = (uint32_t)strtoul(argv[10], NULL, 0); + if (error_resilient != 0 && error_resilient != 1) { + die("Invalid value for error resilient (0, 1): %d.", error_resilient); + } // Enable error resilient mode. - cfg.g_error_resilient = 1; + cfg.g_error_resilient = error_resilient; cfg.g_lag_in_frames = 0; cfg.kf_mode = VPX_KF_AUTO; @@ -700,18 +763,33 @@ int main(int argc, char **argv) { if (strncmp(encoder->name, "vp8", 3) == 0) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed); - vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff); + vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); + vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0); +#if VP8_ROI_MAP + vp8_set_roi_map(&cfg, &roi); + if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi)) + die_codec(&codec, "Failed to set ROI map"); +#endif + } else if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_svc_extra_cfg_t svc_params; memset(&svc_params, 0, sizeof(svc_params)); vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed); vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); + vpx_codec_control(&codec, VP9E_SET_GF_CBR_BOOST_PCT, 0); + vpx_codec_control(&codec, VP9E_SET_FRAME_PARALLEL_DECODING, 0); vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0); - vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kDenoiserOff); + vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kVp9DenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0); vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1)); + // TODO(marpan/jianj): There is an issue with row-mt for low resolutons at + // high speed settings, disable its use for those cases for now. 
+ if (cfg.g_threads > 1 && ((cfg.g_w > 320 && cfg.g_h > 240) || speed < 7)) + vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1); + else + vpx_codec_control(&codec, VP9E_SET_ROW_MT, 0); if (vpx_codec_control(&codec, VP9E_SET_SVC, layering_mode > 0 ? 1 : 0)) die_codec(&codec, "Failed to set SVC"); for (i = 0; i < cfg.ts_number_layers; ++i) { @@ -730,7 +808,7 @@ int main(int argc, char **argv) { // For generating smaller key frames, use a smaller max_intra_size_pct // value, like 100 or 200. { - const int max_intra_size_pct = 900; + const int max_intra_size_pct = 1000; vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, max_intra_size_pct); } @@ -740,10 +818,8 @@ int main(int argc, char **argv) { struct vpx_usec_timer timer; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt; -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) // Update the temporal layer_id. No spatial layers in this test. layer_id.spatial_layer_id = 0; -#endif layer_id.temporal_layer_id = cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity]; if (strncmp(encoder->name, "vp9", 3) == 0) { diff --git a/libs/libvpx/libs.mk b/libs/libvpx/libs.mk index 97ed93bb70..a3e2f9d0eb 100644 --- a/libs/libvpx/libs.mk +++ b/libs/libvpx/libs.mk @@ -12,7 +12,7 @@ # ARM assembly files are written in RVCT-style. We use some make magic to # filter those files to allow GCC compilation ifeq ($(ARCH_ARM),yes) - ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.s,.asm) + ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.S,.asm) else ASM:=.asm endif @@ -106,9 +106,6 @@ ifeq ($(CONFIG_VP9_DECODER),yes) CODEC_DOC_SECTIONS += vp9 vp9_decoder endif -VP9_PREFIX=vp9/ -$(BUILD_PFX)$(VP9_PREFIX)%.c.o: - ifeq ($(CONFIG_ENCODERS),yes) CODEC_DOC_SECTIONS += encoder endif @@ -116,6 +113,12 @@ ifeq ($(CONFIG_DECODERS),yes) CODEC_DOC_SECTIONS += decoder endif +# Suppress -Wextra warnings in third party code. +$(BUILD_PFX)third_party/googletest/%.cc.o: CXXFLAGS += -Wno-missing-field-initializers +# Suppress -Wextra warnings in first party code pending investigation. +# https://bugs.chromium.org/p/webm/issues/detail?id=1069 +$(BUILD_PFX)vp8/encoder/onyx_if.c.o: CFLAGS += -Wno-unknown-warning-option -Wno-clobbered +$(BUILD_PFX)vp8/decoder/onyxd_if.c.o: CFLAGS += -Wno-unknown-warning-option -Wno-clobbered ifeq ($(CONFIG_MSVS),yes) CODEC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxmt,vpxmd) @@ -146,6 +149,7 @@ CODEC_SRCS-yes += $(BUILD_PFX)vpx_config.c INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c ifeq ($(ARCH_X86)$(ARCH_X86_64),yes) INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm +INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += vpx_dsp/x86/bitdepth_conversion_sse2.asm endif CODEC_EXPORTS-yes += vpx/exports_com CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc @@ -184,6 +188,13 @@ libvpx_srcs.txt: @echo $(CODEC_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@ CLEAN-OBJS += libvpx_srcs.txt +# Assembly files that are included, but don't define symbols themselves. +# Filtered out to avoid Windows build warnings. +ASM_INCLUDES := \ + third_party/x86inc/x86inc.asm \ + vpx_config.asm \ + vpx_ports/x86_abi_support.asm \ + vpx_dsp/x86/bitdepth_conversion_sse2.asm \ ifeq ($(CONFIG_EXTERNAL_BUILD),yes) ifeq ($(CONFIG_MSVS),yes) @@ -195,13 +206,6 @@ vpx.def: $(call enabled,CODEC_EXPORTS) --out=$@ $^ CLEAN-OBJS += vpx.def -# Assembly files that are included, but don't define symbols themselves. -# Filtered out to avoid Visual Studio build warnings. 
-ASM_INCLUDES := \ - third_party/x86inc/x86inc.asm \ - vpx_config.asm \ - vpx_ports/x86_abi_support.asm \ - vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def @echo " [CREATE] $@" $(qexec)$(GEN_VCPROJ) \ @@ -224,12 +228,12 @@ vpx.$(VCPROJ_SFX): $(RTCD) endif else -LIBVPX_OBJS=$(call objs,$(CODEC_SRCS)) +LIBVPX_OBJS=$(call objs, $(filter-out $(ASM_INCLUDES), $(CODEC_SRCS))) OBJS-yes += $(LIBVPX_OBJS) LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) -SO_VERSION_MAJOR := 4 +SO_VERSION_MAJOR := 5 SO_VERSION_MINOR := 0 SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) @@ -363,7 +367,7 @@ endif # # Add assembler dependencies for configuration. # -$(filter %.s.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm +$(filter %.S.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm @@ -388,7 +392,7 @@ LIBVPX_TEST_SRCS=$(addprefix test/,$(call enabled,LIBVPX_TEST_SRCS)) LIBVPX_TEST_BIN=./test_libvpx$(EXE_SFX) LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\ $(call enabled,LIBVPX_TEST_DATA)) -libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1) +libvpx_test_data_url=https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx/$(1) TEST_INTRA_PRED_SPEED_BIN=./test_intra_pred_speed$(EXE_SFX) TEST_INTRA_PRED_SPEED_SRCS=$(addprefix test/,$(call enabled,TEST_INTRA_PRED_SPEED_SRCS)) @@ -401,8 +405,16 @@ CLEAN-OBJS += libvpx_test_srcs.txt $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1 @echo " [DOWNLOAD] $@" - $(qexec)trap 'rm -f $@' INT TERM &&\ - curl -L -o $@ $(call libvpx_test_data_url,$(@F)) + # Attempt to download the file using curl, retrying once if it fails for a + # partial file (18). + $(qexec)( \ + trap 'rm -f $@' INT TERM; \ + curl="curl --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F))"; \ + $$curl; \ + case "$$?" in \ + 18) $$curl -C -;; \ + esac \ + ) testdata:: $(LIBVPX_TEST_DATA) $(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\ diff --git a/libs/libvpx/rate_hist.c b/libs/libvpx/rate_hist.c index 872a10bae0..6cf8ce7bb0 100644 --- a/libs/libvpx/rate_hist.c +++ b/libs/libvpx/rate_hist.c @@ -37,7 +37,13 @@ struct rate_hist { struct rate_hist *init_rate_histogram(const vpx_codec_enc_cfg_t *cfg, const vpx_rational_t *fps) { int i; - struct rate_hist *hist = malloc(sizeof(*hist)); + struct rate_hist *hist = calloc(1, sizeof(*hist)); + + if (hist == NULL || cfg == NULL || fps == NULL || fps->num == 0 || + fps->den == 0) { + destroy_rate_histogram(hist); + return NULL; + } // Determine the number of samples in the buffer. 
Use the file's framerate // to determine the number of frames in rc_buf_sz milliseconds, with an @@ -80,7 +86,11 @@ void update_rate_histogram(struct rate_hist *hist, (uint64_t)cfg->g_timebase.num / (uint64_t)cfg->g_timebase.den; - int idx = hist->frames++ % hist->samples; + int idx; + + if (hist == NULL || cfg == NULL || pkt == NULL) return; + + idx = hist->frames++ % hist->samples; hist->pts[idx] = now; hist->sz[idx] = (int)pkt->data.frame.sz; @@ -116,9 +126,14 @@ void update_rate_histogram(struct rate_hist *hist, static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets, int *num_buckets) { int small_bucket = 0, merge_bucket = INT_MAX, big_bucket = 0; - int buckets = *num_buckets; + int buckets; int i; + assert(bucket != NULL); + assert(num_buckets != NULL); + + buckets = *num_buckets; + /* Find the extrema for this list of buckets */ big_bucket = small_bucket = 0; for (i = 0; i < buckets; i++) { @@ -181,6 +196,8 @@ static void show_histogram(const struct hist_bucket *bucket, int buckets, const char *pat1, *pat2; int i; + assert(bucket != NULL); + switch ((int)(log(bucket[buckets - 1].high) / log(10)) + 1) { case 1: case 2: @@ -259,6 +276,8 @@ void show_rate_histogram(struct rate_hist *hist, const vpx_codec_enc_cfg_t *cfg, int i, scale; int buckets = 0; + if (hist == NULL || cfg == NULL) return; + for (i = 0; i < RATE_BINS; i++) { if (hist->bucket[i].low == INT_MAX) continue; hist->bucket[buckets++] = hist->bucket[i]; diff --git a/libs/libvpx/test/acm_random.h b/libs/libvpx/test/acm_random.h index c2f6b0e410..d915cf9133 100644 --- a/libs/libvpx/test/acm_random.h +++ b/libs/libvpx/test/acm_random.h @@ -11,6 +11,10 @@ #ifndef TEST_ACM_RANDOM_H_ #define TEST_ACM_RANDOM_H_ +#include + +#include + #include "third_party/googletest/src/include/gtest/gtest.h" #include "vpx/vpx_integer.h" @@ -50,6 +54,13 @@ class ACMRandom { return r < 128 ? r << 4 : r >> 4; } + uint32_t RandRange(const uint32_t range) { + // testing::internal::Random::Generate provides values in the range + // testing::internal::Random::kMaxRange. 
+ assert(range <= testing::internal::Random::kMaxRange); + return random_.Generate(range); + } + int PseudoUniform(int range) { return random_.Generate(range); } int operator()(int n) { return PseudoUniform(n); } diff --git a/libs/libvpx/test/android/Android.mk b/libs/libvpx/test/android/Android.mk index 48872a2b65..7318de2fc4 100644 --- a/libs/libvpx/test/android/Android.mk +++ b/libs/libvpx/test/android/Android.mk @@ -32,6 +32,7 @@ LOCAL_CPP_EXTENSION := .cc LOCAL_MODULE := gtest LOCAL_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/ LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include/ +LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/include/ LOCAL_SRC_FILES := ./third_party/googletest/src/src/gtest-all.cc include $(BUILD_STATIC_LIBRARY) diff --git a/libs/libvpx/test/avg_test.cc b/libs/libvpx/test/avg_test.cc index 867a77aa0d..ad21198e4b 100644 --- a/libs/libvpx/test/avg_test.cc +++ b/libs/libvpx/test/avg_test.cc @@ -14,6 +14,7 @@ #include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vp9_rtcd.h" #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" @@ -22,6 +23,7 @@ #include "test/register_state_check.h" #include "test/util.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; @@ -53,7 +55,7 @@ class AverageTestBase : public ::testing::Test { } // Sum Pixels - unsigned int ReferenceAverage8x8(const uint8_t *source, int pitch) { + static unsigned int ReferenceAverage8x8(const uint8_t *source, int pitch) { unsigned int average = 0; for (int h = 0; h < 8; ++h) { for (int w = 0; w < 8; ++w) average += source[h * pitch + w]; @@ -61,7 +63,7 @@ class AverageTestBase : public ::testing::Test { return ((average + 32) >> 6); } - unsigned int ReferenceAverage4x4(const uint8_t *source, int pitch) { + static unsigned int ReferenceAverage4x4(const uint8_t *source, int pitch) { unsigned int average = 0; for (int h = 0; h < 4; ++h) { for (int w = 0; w < 4; ++w) average += source[h * pitch + w]; @@ -98,11 +100,12 @@ class AverageTest : public AverageTestBase, protected: void CheckAverages() { + const int block_size = GET_PARAM(3); unsigned int expected = 0; - if (GET_PARAM(3) == 8) { + if (block_size == 8) { expected = ReferenceAverage8x8(source_data_ + GET_PARAM(2), source_stride_); - } else if (GET_PARAM(3) == 4) { + } else if (block_size == 4) { expected = ReferenceAverage4x4(source_data_ + GET_PARAM(2), source_stride_); } @@ -185,7 +188,7 @@ class IntProColTest : public AverageTestBase, int16_t sum_c_; }; -typedef int (*SatdFunc)(const int16_t *coeffs, int length); +typedef int (*SatdFunc)(const tran_low_t *coeffs, int length); typedef std::tr1::tuple SatdTestParam; class SatdTest : public ::testing::Test, @@ -195,7 +198,7 @@ class SatdTest : public ::testing::Test, satd_size_ = GET_PARAM(0); satd_func_ = GET_PARAM(1); rnd_.Reset(ACMRandom::DeterministicSeed()); - src_ = reinterpret_cast( + src_ = reinterpret_cast( vpx_memalign(16, sizeof(*src_) * satd_size_)); ASSERT_TRUE(src_ != NULL); } @@ -205,12 +208,15 @@ class SatdTest : public ::testing::Test, vpx_free(src_); } - void FillConstant(const int16_t val) { + void FillConstant(const tran_low_t val) { for (int i = 0; i < satd_size_; ++i) src_[i] = val; } void FillRandom() { - for (int i = 0; i < satd_size_; ++i) src_[i] = rnd_.Rand16(); + for (int i = 0; i < satd_size_; ++i) { + const int16_t tmp = rnd_.Rand16(); + src_[i] = (tran_low_t)tmp; + } } void Check(const int expected) { @@ -222,11 +228,66 @@ class SatdTest : public ::testing::Test, int 
satd_size_; private: - int16_t *src_; + tran_low_t *src_; SatdFunc satd_func_; ACMRandom rnd_; }; +typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size); +typedef std::tr1::tuple BlockErrorTestFPParam; + +class BlockErrorTestFP + : public ::testing::Test, + public ::testing::WithParamInterface { + protected: + virtual void SetUp() { + txfm_size_ = GET_PARAM(0); + block_error_func_ = GET_PARAM(1); + rnd_.Reset(ACMRandom::DeterministicSeed()); + coeff_ = reinterpret_cast( + vpx_memalign(16, sizeof(*coeff_) * txfm_size_)); + dqcoeff_ = reinterpret_cast( + vpx_memalign(16, sizeof(*dqcoeff_) * txfm_size_)); + ASSERT_TRUE(coeff_ != NULL); + ASSERT_TRUE(dqcoeff_ != NULL); + } + + virtual void TearDown() { + libvpx_test::ClearSystemState(); + vpx_free(coeff_); + vpx_free(dqcoeff_); + } + + void FillConstant(const tran_low_t coeff_val, const tran_low_t dqcoeff_val) { + for (int i = 0; i < txfm_size_; ++i) coeff_[i] = coeff_val; + for (int i = 0; i < txfm_size_; ++i) dqcoeff_[i] = dqcoeff_val; + } + + void FillRandom() { + // Just two fixed seeds + rnd_.Reset(0xb0b9); + for (int i = 0; i < txfm_size_; ++i) coeff_[i] = rnd_.Rand16() >> 1; + rnd_.Reset(0xb0c8); + for (int i = 0; i < txfm_size_; ++i) dqcoeff_[i] = rnd_.Rand16() >> 1; + } + + void Check(const int64_t expected) { + int64_t total; + ASM_REGISTER_STATE_CHECK( + total = block_error_func_(coeff_, dqcoeff_, txfm_size_)); + EXPECT_EQ(expected, total); + } + + int txfm_size_; + + private: + tran_low_t *coeff_; + tran_low_t *dqcoeff_; + BlockErrorFunc block_error_func_; + ACMRandom rnd_; +}; + uint8_t *AverageTestBase::source_data_ = NULL; TEST_P(AverageTest, MinValue) { @@ -307,6 +368,66 @@ TEST_P(SatdTest, Random) { Check(expected); } +TEST_P(SatdTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 20000; + vpx_usec_timer timer; + DECLARE_ALIGNED(16, tran_low_t, coeff[1024]); + const int blocksize = GET_PARAM(0); + + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + GET_PARAM(1)(coeff, blocksize); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); +} + +TEST_P(BlockErrorTestFP, MinValue) { + const int64_t kMin = -32640; + const int64_t expected = kMin * kMin * txfm_size_; + FillConstant(kMin, 0); + Check(expected); +} + +TEST_P(BlockErrorTestFP, MaxValue) { + const int64_t kMax = 32640; + const int64_t expected = kMax * kMax * txfm_size_; + FillConstant(kMax, 0); + Check(expected); +} + +TEST_P(BlockErrorTestFP, Random) { + int64_t expected; + switch (txfm_size_) { + case 16: expected = 2051681432; break; + case 64: expected = 11075114379; break; + case 256: expected = 44386271116; break; + case 1024: expected = 184774996089; break; + default: + FAIL() << "Invalid satd size (" << txfm_size_ + << ") valid: 16/64/256/1024"; + } + FillRandom(); + Check(expected); +} + +TEST_P(BlockErrorTestFP, DISABLED_Speed) { + const int kCountSpeedTestBlock = 20000; + vpx_usec_timer timer; + DECLARE_ALIGNED(16, tran_low_t, coeff[1024]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[1024]); + const int blocksize = GET_PARAM(0); + + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + GET_PARAM(1)(coeff, dqcoeff, blocksize); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); +} + using 
std::tr1::make_tuple; INSTANTIATE_TEST_CASE_P( @@ -320,6 +441,13 @@ INSTANTIATE_TEST_CASE_P(C, SatdTest, make_tuple(256, &vpx_satd_c), make_tuple(1024, &vpx_satd_c))); +INSTANTIATE_TEST_CASE_P( + C, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_c), + make_tuple(64, &vp9_block_error_fp_c), + make_tuple(256, &vp9_block_error_fp_c), + make_tuple(1024, &vp9_block_error_fp_c))); + #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( SSE2, AverageTest, @@ -349,6 +477,28 @@ INSTANTIATE_TEST_CASE_P(SSE2, SatdTest, make_tuple(64, &vpx_satd_sse2), make_tuple(256, &vpx_satd_sse2), make_tuple(1024, &vpx_satd_sse2))); + +INSTANTIATE_TEST_CASE_P( + SSE2, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_sse2), + make_tuple(64, &vp9_block_error_fp_sse2), + make_tuple(256, &vp9_block_error_fp_sse2), + make_tuple(1024, &vp9_block_error_fp_sse2))); +#endif // HAVE_SSE2 + +#if HAVE_AVX2 +INSTANTIATE_TEST_CASE_P(AVX2, SatdTest, + ::testing::Values(make_tuple(16, &vpx_satd_avx2), + make_tuple(64, &vpx_satd_avx2), + make_tuple(256, &vpx_satd_avx2), + make_tuple(1024, &vpx_satd_avx2))); + +INSTANTIATE_TEST_CASE_P( + AVX2, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2), + make_tuple(64, &vp9_block_error_fp_avx2), + make_tuple(256, &vp9_block_error_fp_avx2), + make_tuple(1024, &vp9_block_error_fp_avx2))); #endif #if HAVE_NEON @@ -380,7 +530,18 @@ INSTANTIATE_TEST_CASE_P(NEON, SatdTest, make_tuple(64, &vpx_satd_neon), make_tuple(256, &vpx_satd_neon), make_tuple(1024, &vpx_satd_neon))); -#endif + +// TODO(jianj): Remove the highbitdepth flag once the SIMD functions are +// in place. +#if !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + NEON, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_neon), + make_tuple(64, &vp9_block_error_fp_neon), + make_tuple(256, &vp9_block_error_fp_neon), + make_tuple(1024, &vp9_block_error_fp_neon))); +#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON #if HAVE_MSA INSTANTIATE_TEST_CASE_P( @@ -391,6 +552,30 @@ INSTANTIATE_TEST_CASE_P( make_tuple(16, 16, 0, 4, &vpx_avg_4x4_msa), make_tuple(16, 16, 5, 4, &vpx_avg_4x4_msa), make_tuple(32, 32, 15, 4, &vpx_avg_4x4_msa))); -#endif + +INSTANTIATE_TEST_CASE_P( + MSA, IntProRowTest, + ::testing::Values(make_tuple(16, &vpx_int_pro_row_msa, &vpx_int_pro_row_c), + make_tuple(32, &vpx_int_pro_row_msa, &vpx_int_pro_row_c), + make_tuple(64, &vpx_int_pro_row_msa, + &vpx_int_pro_row_c))); + +INSTANTIATE_TEST_CASE_P( + MSA, IntProColTest, + ::testing::Values(make_tuple(16, &vpx_int_pro_col_msa, &vpx_int_pro_col_c), + make_tuple(32, &vpx_int_pro_col_msa, &vpx_int_pro_col_c), + make_tuple(64, &vpx_int_pro_col_msa, + &vpx_int_pro_col_c))); + +// TODO(jingning): Remove the highbitdepth flag once the SIMD functions are +// in place. +#if !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P(MSA, SatdTest, + ::testing::Values(make_tuple(16, &vpx_satd_msa), + make_tuple(64, &vpx_satd_msa), + make_tuple(256, &vpx_satd_msa), + make_tuple(1024, &vpx_satd_msa))); +#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_MSA } // namespace diff --git a/libs/libvpx/test/buffer.h b/libs/libvpx/test/buffer.h new file mode 100644 index 0000000000..2175dad9d9 --- /dev/null +++ b/libs/libvpx/test/buffer.h @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef TEST_BUFFER_H_ +#define TEST_BUFFER_H_ + +#include + +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/acm_random.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" + +namespace libvpx_test { + +template +class Buffer { + public: + Buffer(int width, int height, int top_padding, int left_padding, + int right_padding, int bottom_padding) + : width_(width), height_(height), top_padding_(top_padding), + left_padding_(left_padding), right_padding_(right_padding), + bottom_padding_(bottom_padding), alignment_(0), padding_value_(0), + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + + Buffer(int width, int height, int top_padding, int left_padding, + int right_padding, int bottom_padding, unsigned int alignment) + : width_(width), height_(height), top_padding_(top_padding), + left_padding_(left_padding), right_padding_(right_padding), + bottom_padding_(bottom_padding), alignment_(alignment), + padding_value_(0), stride_(0), raw_size_(0), num_elements_(0), + raw_buffer_(NULL) {} + + Buffer(int width, int height, int padding) + : width_(width), height_(height), top_padding_(padding), + left_padding_(padding), right_padding_(padding), + bottom_padding_(padding), alignment_(0), padding_value_(0), stride_(0), + raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + + Buffer(int width, int height, int padding, unsigned int alignment) + : width_(width), height_(height), top_padding_(padding), + left_padding_(padding), right_padding_(padding), + bottom_padding_(padding), alignment_(alignment), padding_value_(0), + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + + ~Buffer() { + if (alignment_) { + vpx_free(raw_buffer_); + } else { + delete[] raw_buffer_; + } + } + + T *TopLeftPixel() const; + + int stride() const { return stride_; } + + // Set the buffer (excluding padding) to 'value'. + void Set(const T value); + + // Set the buffer (excluding padding) to the output of ACMRandom function + // 'rand_func'. + void Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)()); + + // Set the buffer (excluding padding) to the output of ACMRandom function + // 'RandRange' with range 'low' to 'high' which typically must be within + // testing::internal::Random::kMaxRange (1u << 31). However, because we want + // to allow negative low (and high) values, it is restricted to INT32_MAX + // here. + void Set(ACMRandom *rand_class, const T low, const T high); + + // Copy the contents of Buffer 'a' (excluding padding). + void CopyFrom(const Buffer &a); + + void DumpBuffer() const; + + // Highlight the differences between two buffers if they are the same size. + void PrintDifference(const Buffer &a) const; + + bool HasPadding() const; + + // Sets all the values in the buffer to 'padding_value'. + void SetPadding(const T padding_value); + + // Checks if all the values (excluding padding) are equal to 'value' if the + // Buffers are the same size. + bool CheckValues(const T value) const; + + // Check that padding matches the expected value or there is no padding. + bool CheckPadding() const; + + // Compare the non-padding portion of two buffers if they are the same size. 
+ bool CheckValues(const Buffer &a) const; + + bool Init() { + if (raw_buffer_ != NULL) return false; + EXPECT_GT(width_, 0); + EXPECT_GT(height_, 0); + EXPECT_GE(top_padding_, 0); + EXPECT_GE(left_padding_, 0); + EXPECT_GE(right_padding_, 0); + EXPECT_GE(bottom_padding_, 0); + stride_ = left_padding_ + width_ + right_padding_; + num_elements_ = stride_ * (top_padding_ + height_ + bottom_padding_); + raw_size_ = num_elements_ * sizeof(T); + if (alignment_) { + EXPECT_GE(alignment_, sizeof(T)); + // Ensure alignment of the first value will be preserved. + EXPECT_EQ((left_padding_ * sizeof(T)) % alignment_, 0u); + // Ensure alignment of the subsequent rows will be preserved when there is + // a stride. + if (stride_ != width_) { + EXPECT_EQ((stride_ * sizeof(T)) % alignment_, 0u); + } + raw_buffer_ = reinterpret_cast(vpx_memalign(alignment_, raw_size_)); + } else { + raw_buffer_ = new (std::nothrow) T[num_elements_]; + } + EXPECT_TRUE(raw_buffer_ != NULL); + SetPadding(std::numeric_limits::max()); + return !::testing::Test::HasFailure(); + } + + private: + bool BufferSizesMatch(const Buffer &a) const; + + const int width_; + const int height_; + const int top_padding_; + const int left_padding_; + const int right_padding_; + const int bottom_padding_; + const unsigned int alignment_; + T padding_value_; + int stride_; + int raw_size_; + int num_elements_; + T *raw_buffer_; +}; + +template +T *Buffer::TopLeftPixel() const { + if (!raw_buffer_) return NULL; + return raw_buffer_ + (top_padding_ * stride_) + left_padding_; +} + +template +void Buffer::Set(const T value) { + if (!raw_buffer_) return; + T *src = TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + src[width] = value; + } + src += stride_; + } +} + +template +void Buffer::Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)()) { + if (!raw_buffer_) return; + T *src = TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + src[width] = (*rand_class.*rand_func)(); + } + src += stride_; + } +} + +template +void Buffer::Set(ACMRandom *rand_class, const T low, const T high) { + if (!raw_buffer_) return; + + EXPECT_LE(low, high); + EXPECT_LE(static_cast(high) - low, + std::numeric_limits::max()); + + T *src = TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + // 'low' will be promoted to unsigned given the return type of RandRange. + // Store the value as an int to avoid unsigned overflow warnings when + // 'low' is negative. 
+ const int32_t value = + static_cast((*rand_class).RandRange(high - low)); + src[width] = static_cast(value + low); + } + src += stride_; + } +} + +template +void Buffer::CopyFrom(const Buffer &a) { + if (!raw_buffer_) return; + if (!BufferSizesMatch(a)) return; + + T *a_src = a.TopLeftPixel(); + T *b_src = this->TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + b_src[width] = a_src[width]; + } + a_src += a.stride(); + b_src += this->stride(); + } +} + +template +void Buffer::DumpBuffer() const { + if (!raw_buffer_) return; + for (int height = 0; height < height_ + top_padding_ + bottom_padding_; + ++height) { + for (int width = 0; width < stride_; ++width) { + printf("%4d", raw_buffer_[height + width * stride_]); + } + printf("\n"); + } +} + +template +bool Buffer::HasPadding() const { + if (!raw_buffer_) return false; + return top_padding_ || left_padding_ || right_padding_ || bottom_padding_; +} + +template +void Buffer::PrintDifference(const Buffer &a) const { + if (!raw_buffer_) return; + if (!BufferSizesMatch(a)) return; + + T *a_src = a.TopLeftPixel(); + T *b_src = TopLeftPixel(); + + printf("This buffer:\n"); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + if (a_src[width] != b_src[width]) { + printf("*%3d", b_src[width]); + } else { + printf("%4d", b_src[width]); + } + } + printf("\n"); + a_src += a.stride(); + b_src += this->stride(); + } + + a_src = a.TopLeftPixel(); + b_src = TopLeftPixel(); + + printf("Reference buffer:\n"); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + if (a_src[width] != b_src[width]) { + printf("*%3d", a_src[width]); + } else { + printf("%4d", a_src[width]); + } + } + printf("\n"); + a_src += a.stride(); + b_src += this->stride(); + } +} + +template +void Buffer::SetPadding(const T padding_value) { + if (!raw_buffer_) return; + padding_value_ = padding_value; + + T *src = raw_buffer_; + for (int i = 0; i < num_elements_; ++i) { + src[i] = padding_value; + } +} + +template +bool Buffer::CheckValues(const T value) const { + if (!raw_buffer_) return false; + T *src = TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + if (value != src[width]) { + return false; + } + } + src += stride_; + } + return true; +} + +template +bool Buffer::CheckPadding() const { + if (!raw_buffer_) return false; + if (!HasPadding()) return true; + + // Top padding. + T const *top = raw_buffer_; + for (int i = 0; i < stride_ * top_padding_; ++i) { + if (padding_value_ != top[i]) { + return false; + } + } + + // Left padding. + T const *left = TopLeftPixel() - left_padding_; + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < left_padding_; ++width) { + if (padding_value_ != left[width]) { + return false; + } + } + left += stride_; + } + + // Right padding. 
+ T const *right = TopLeftPixel() + width_; + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < right_padding_; ++width) { + if (padding_value_ != right[width]) { + return false; + } + } + right += stride_; + } + + // Bottom padding + T const *bottom = raw_buffer_ + (top_padding_ + height_) * stride_; + for (int i = 0; i < stride_ * bottom_padding_; ++i) { + if (padding_value_ != bottom[i]) { + return false; + } + } + + return true; +} + +template +bool Buffer::CheckValues(const Buffer &a) const { + if (!raw_buffer_) return false; + if (!BufferSizesMatch(a)) return false; + + T *a_src = a.TopLeftPixel(); + T *b_src = this->TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + if (a_src[width] != b_src[width]) { + return false; + } + } + a_src += a.stride(); + b_src += this->stride(); + } + return true; +} + +template +bool Buffer::BufferSizesMatch(const Buffer &a) const { + if (!raw_buffer_) return false; + if (a.width_ != this->width_ || a.height_ != this->height_) { + printf( + "Reference buffer of size %dx%d does not match this buffer which is " + "size %dx%d\n", + a.width_, a.height_, this->width_, this->height_); + return false; + } + + return true; +} +} // namespace libvpx_test +#endif // TEST_BUFFER_H_ diff --git a/libs/libvpx/test/byte_alignment_test.cc b/libs/libvpx/test/byte_alignment_test.cc index d78294d10f..5a058b2756 100644 --- a/libs/libvpx/test/byte_alignment_test.cc +++ b/libs/libvpx/test/byte_alignment_test.cc @@ -128,8 +128,8 @@ class ByteAlignmentTest // TODO(fgalligan): Move the MD5 testing code into another class. void OpenMd5File(const std::string &md5_file_name_) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); - ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: " - << md5_file_name_; + ASSERT_TRUE(md5_file_ != NULL) + << "MD5 file open failed. 
Filename: " << md5_file_name_; } void CheckMd5(const vpx_image_t &img) { diff --git a/libs/libvpx/test/codec_factory.h b/libs/libvpx/test/codec_factory.h index e867dacafe..d5882ed9c8 100644 --- a/libs/libvpx/test/codec_factory.h +++ b/libs/libvpx/test/codec_factory.h @@ -65,6 +65,12 @@ class CodecTestWith3Params : public ::testing::TestWithParam< std::tr1::tuple > {}; +template +class CodecTestWith4Params + : public ::testing::TestWithParam< + std::tr1::tuple > { +}; + /* * VP8 Codec Definitions */ @@ -115,6 +121,8 @@ class VP8CodecFactory : public CodecFactory { #if CONFIG_VP8_DECODER return new VP8Decoder(cfg, flags); #else + (void)cfg; + (void)flags; return NULL; #endif } @@ -126,6 +134,10 @@ class VP8CodecFactory : public CodecFactory { #if CONFIG_VP8_ENCODER return new VP8Encoder(cfg, deadline, init_flags, stats); #else + (void)cfg; + (void)deadline; + (void)init_flags; + (void)stats; return NULL; #endif } @@ -135,6 +147,8 @@ class VP8CodecFactory : public CodecFactory { #if CONFIG_VP8_ENCODER return vpx_codec_enc_config_default(&vpx_codec_vp8_cx_algo, cfg, usage); #else + (void)cfg; + (void)usage; return VPX_CODEC_INCAPABLE; #endif } @@ -203,6 +217,8 @@ class VP9CodecFactory : public CodecFactory { #if CONFIG_VP9_DECODER return new VP9Decoder(cfg, flags); #else + (void)cfg; + (void)flags; return NULL; #endif } @@ -214,6 +230,10 @@ class VP9CodecFactory : public CodecFactory { #if CONFIG_VP9_ENCODER return new VP9Encoder(cfg, deadline, init_flags, stats); #else + (void)cfg; + (void)deadline; + (void)init_flags; + (void)stats; return NULL; #endif } @@ -223,6 +243,8 @@ class VP9CodecFactory : public CodecFactory { #if CONFIG_VP9_ENCODER return vpx_codec_enc_config_default(&vpx_codec_vp9_cx_algo, cfg, usage); #else + (void)cfg; + (void)usage; return VPX_CODEC_INCAPABLE; #endif } diff --git a/libs/libvpx/test/comp_avg_pred_test.cc b/libs/libvpx/test/comp_avg_pred_test.cc new file mode 100644 index 0000000000..110e065836 --- /dev/null +++ b/libs/libvpx/test/comp_avg_pred_test.cc @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_dsp_rtcd.h" + +#include "test/acm_random.h" +#include "test/buffer.h" +#include "test/register_state_check.h" +#include "vpx_ports/vpx_timer.h" + +namespace { + +using ::libvpx_test::ACMRandom; +using ::libvpx_test::Buffer; + +typedef void (*AvgPredFunc)(uint8_t *a, const uint8_t *b, int w, int h, + const uint8_t *c, int c_stride); + +uint8_t avg_with_rounding(uint8_t a, uint8_t b) { return (a + b + 1) >> 1; } + +void reference_pred(const Buffer &pred, const Buffer &ref, + int width, int height, Buffer *avg) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + avg->TopLeftPixel()[y * avg->stride() + x] = + avg_with_rounding(pred.TopLeftPixel()[y * pred.stride() + x], + ref.TopLeftPixel()[y * ref.stride() + x]); + } + } +} + +class AvgPredTest : public ::testing::TestWithParam { + public: + virtual void SetUp() { + avg_pred_func_ = GetParam(); + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + protected: + AvgPredFunc avg_pred_func_; + ACMRandom rnd_; +}; + +TEST_P(AvgPredTest, SizeCombinations) { + // This is called as part of the sub pixel variance. As such it must be one of + // the variance block sizes. + + for (int width_pow = 2; width_pow <= 6; ++width_pow) { + for (int height_pow = width_pow - 1; height_pow <= width_pow + 1; + ++height_pow) { + // Don't test 4x2 or 64x128 + if (height_pow == 1 || height_pow == 7) continue; + + // The sse2 special-cases when ref width == stride, so make sure to test + // it. + for (int ref_padding = 0; ref_padding < 2; ref_padding++) { + const int width = 1 << width_pow; + const int height = 1 << height_pow; + // Only the reference buffer may have a stride not equal to width. + Buffer ref = + Buffer(width, height, ref_padding ? 
8 : 0); + ASSERT_TRUE(ref.Init()); + Buffer pred = Buffer(width, height, 0, 16); + ASSERT_TRUE(pred.Init()); + Buffer avg_ref = Buffer(width, height, 0, 16); + ASSERT_TRUE(avg_ref.Init()); + Buffer avg_chk = Buffer(width, height, 0, 16); + ASSERT_TRUE(avg_chk.Init()); + + ref.Set(&rnd_, &ACMRandom::Rand8); + pred.Set(&rnd_, &ACMRandom::Rand8); + + reference_pred(pred, ref, width, height, &avg_ref); + ASM_REGISTER_STATE_CHECK( + avg_pred_func_(avg_chk.TopLeftPixel(), pred.TopLeftPixel(), width, + height, ref.TopLeftPixel(), ref.stride())); + + EXPECT_TRUE(avg_chk.CheckValues(avg_ref)); + if (HasFailure()) { + printf("Width: %d Height: %d\n", width, height); + avg_chk.PrintDifference(avg_ref); + return; + } + } + } + } +} + +TEST_P(AvgPredTest, CompareReferenceRandom) { + const int width = 64; + const int height = 32; + Buffer ref = Buffer(width, height, 8); + ASSERT_TRUE(ref.Init()); + Buffer pred = Buffer(width, height, 0, 16); + ASSERT_TRUE(pred.Init()); + Buffer avg_ref = Buffer(width, height, 0, 16); + ASSERT_TRUE(avg_ref.Init()); + Buffer avg_chk = Buffer(width, height, 0, 16); + ASSERT_TRUE(avg_chk.Init()); + + for (int i = 0; i < 500; ++i) { + ref.Set(&rnd_, &ACMRandom::Rand8); + pred.Set(&rnd_, &ACMRandom::Rand8); + + reference_pred(pred, ref, width, height, &avg_ref); + ASM_REGISTER_STATE_CHECK(avg_pred_func_(avg_chk.TopLeftPixel(), + pred.TopLeftPixel(), width, height, + ref.TopLeftPixel(), ref.stride())); + EXPECT_TRUE(avg_chk.CheckValues(avg_ref)); + if (HasFailure()) { + printf("Width: %d Height: %d\n", width, height); + avg_chk.PrintDifference(avg_ref); + return; + } + } +} + +TEST_P(AvgPredTest, DISABLED_Speed) { + for (int width_pow = 2; width_pow <= 6; ++width_pow) { + for (int height_pow = width_pow - 1; height_pow <= width_pow + 1; + ++height_pow) { + // Don't test 4x2 or 64x128 + if (height_pow == 1 || height_pow == 7) continue; + + for (int ref_padding = 0; ref_padding < 2; ref_padding++) { + const int width = 1 << width_pow; + const int height = 1 << height_pow; + Buffer ref = + Buffer(width, height, ref_padding ? 
8 : 0); + ASSERT_TRUE(ref.Init()); + Buffer pred = Buffer(width, height, 0, 16); + ASSERT_TRUE(pred.Init()); + Buffer avg = Buffer(width, height, 0, 16); + ASSERT_TRUE(avg.Init()); + + ref.Set(&rnd_, &ACMRandom::Rand8); + pred.Set(&rnd_, &ACMRandom::Rand8); + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < 10000000 / (width * height); ++i) { + avg_pred_func_(avg.TopLeftPixel(), pred.TopLeftPixel(), width, height, + ref.TopLeftPixel(), ref.stride()); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer)); + printf("Average Test (ref_padding: %d) %dx%d time: %5d us\n", + ref_padding, width, height, elapsed_time); + } + } + } +} + +INSTANTIATE_TEST_CASE_P(C, AvgPredTest, + ::testing::Values(&vpx_comp_avg_pred_c)); + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P(SSE2, AvgPredTest, + ::testing::Values(&vpx_comp_avg_pred_sse2)); +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, AvgPredTest, + ::testing::Values(&vpx_comp_avg_pred_neon)); +#endif // HAVE_NEON + +#if HAVE_VSX +INSTANTIATE_TEST_CASE_P(VSX, AvgPredTest, + ::testing::Values(&vpx_comp_avg_pred_vsx)); +#endif // HAVE_VSX +} // namespace diff --git a/libs/libvpx/test/convolve_test.cc b/libs/libvpx/test/convolve_test.cc index 432d1b009e..70f0b11a77 100644 --- a/libs/libvpx/test/convolve_test.cc +++ b/libs/libvpx/test/convolve_test.cc @@ -12,8 +12,8 @@ #include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vpx_config.h" #include "./vp9_rtcd.h" +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" #include "test/clear_system_state.h" @@ -25,6 +25,7 @@ #include "vpx_dsp/vpx_filter.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" namespace { @@ -32,9 +33,15 @@ static const unsigned int kMaxDimension = 64; typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h); + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h); + +typedef void (*WrapperFilterBlock2d8Func)( + const uint8_t *src_ptr, const unsigned int src_stride, + const int16_t *hfilter, const int16_t *vfilter, uint8_t *dst_ptr, + unsigned int dst_stride, unsigned int output_width, + unsigned int output_height, int use_highbd); struct ConvolveFunctions { ConvolveFunctions(ConvolveFunc copy, ConvolveFunc avg, ConvolveFunc h8, @@ -43,25 +50,30 @@ struct ConvolveFunctions { ConvolveFunc sh8_avg, ConvolveFunc sv8, ConvolveFunc sv8_avg, ConvolveFunc shv8, ConvolveFunc shv8_avg, int bd) - : copy_(copy), avg_(avg), h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg), - v8_avg_(v8_avg), hv8_avg_(hv8_avg), sh8_(sh8), sv8_(sv8), shv8_(shv8), - sh8_avg_(sh8_avg), sv8_avg_(sv8_avg), shv8_avg_(shv8_avg), - use_highbd_(bd) {} + : use_highbd_(bd) { + copy_[0] = copy; + copy_[1] = avg; + h8_[0] = h8; + h8_[1] = h8_avg; + v8_[0] = v8; + v8_[1] = v8_avg; + hv8_[0] = hv8; + hv8_[1] = hv8_avg; + sh8_[0] = sh8; + sh8_[1] = sh8_avg; + sv8_[0] = sv8; + sv8_[1] = sv8_avg; + shv8_[0] = shv8; + shv8_[1] = shv8_avg; + } - ConvolveFunc copy_; - ConvolveFunc avg_; - ConvolveFunc h8_; - ConvolveFunc v8_; - ConvolveFunc hv8_; - ConvolveFunc h8_avg_; - ConvolveFunc v8_avg_; - ConvolveFunc hv8_avg_; - ConvolveFunc sh8_; // scaled horiz - ConvolveFunc sv8_; // scaled vert - ConvolveFunc shv8_; // scaled horiz/vert - ConvolveFunc 
sh8_avg_; // scaled avg horiz - ConvolveFunc sv8_avg_; // scaled avg vert - ConvolveFunc shv8_avg_; // scaled avg horiz/vert + ConvolveFunc copy_[2]; + ConvolveFunc h8_[2]; + ConvolveFunc v8_[2]; + ConvolveFunc hv8_[2]; + ConvolveFunc sh8_[2]; // scaled horiz + ConvolveFunc sv8_[2]; // scaled vert + ConvolveFunc shv8_[2]; // scaled horiz/vert int use_highbd_; // 0 if high bitdepth not used, else the actual bit depth. }; @@ -82,7 +94,7 @@ typedef std::tr1::tuple ConvolveParam; uint8_t clip_pixel(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; } void filter_block2d_8_c(const uint8_t *src_ptr, const unsigned int src_stride, - const int16_t *HFilter, const int16_t *VFilter, + const int16_t *hfilter, const int16_t *vfilter, uint8_t *dst_ptr, unsigned int dst_stride, unsigned int output_width, unsigned int output_height) { // Between passes, we use an intermediate buffer whose height is extended to @@ -112,10 +124,10 @@ void filter_block2d_8_c(const uint8_t *src_ptr, const unsigned int src_stride, for (i = 0; i < intermediate_height; ++i) { for (j = 0; j < output_width; ++j) { // Apply filter... - const int temp = (src_ptr[0] * HFilter[0]) + (src_ptr[1] * HFilter[1]) + - (src_ptr[2] * HFilter[2]) + (src_ptr[3] * HFilter[3]) + - (src_ptr[4] * HFilter[4]) + (src_ptr[5] * HFilter[5]) + - (src_ptr[6] * HFilter[6]) + (src_ptr[7] * HFilter[7]) + + const int temp = (src_ptr[0] * hfilter[0]) + (src_ptr[1] * hfilter[1]) + + (src_ptr[2] * hfilter[2]) + (src_ptr[3] * hfilter[3]) + + (src_ptr[4] * hfilter[4]) + (src_ptr[5] * hfilter[5]) + + (src_ptr[6] * hfilter[6]) + (src_ptr[7] * hfilter[7]) + (VP9_FILTER_WEIGHT >> 1); // Rounding // Normalize back to 0-255... @@ -133,10 +145,10 @@ void filter_block2d_8_c(const uint8_t *src_ptr, const unsigned int src_stride, for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { // Apply filter... - const int temp = (src_ptr[0] * VFilter[0]) + (src_ptr[1] * VFilter[1]) + - (src_ptr[2] * VFilter[2]) + (src_ptr[3] * VFilter[3]) + - (src_ptr[4] * VFilter[4]) + (src_ptr[5] * VFilter[5]) + - (src_ptr[6] * VFilter[6]) + (src_ptr[7] * VFilter[7]) + + const int temp = (src_ptr[0] * vfilter[0]) + (src_ptr[1] * vfilter[1]) + + (src_ptr[2] * vfilter[2]) + (src_ptr[3] * vfilter[3]) + + (src_ptr[4] * vfilter[4]) + (src_ptr[5] * vfilter[5]) + + (src_ptr[6] * vfilter[6]) + (src_ptr[7] * vfilter[7]) + (VP9_FILTER_WEIGHT >> 1); // Rounding // Normalize back to 0-255... 
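The ConvolveFunc prototype above now takes the whole InterpKernel bank plus 1/16-pel offsets and steps (x0_q4, x_step_q4, y0_q4, y_step_q4) instead of the old per-direction filter pointer and stride, which is what lets the same function table drive both the regular and the scaled tests. Below is a minimal sketch of how an implementation typically interprets those arguments. It is patterned after the plain C reference in vpx_dsp but is illustrative only; convolve_horiz_sketch is a made-up name, and a step of 16 simply advances one source pixel per output pixel (no scaling).

#include <stddef.h>
#include <stdint.h>

#include "vpx_dsp/vpx_dsp_common.h"  // clip_pixel(), ROUND_POWER_OF_TWO()
#include "vpx_dsp/vpx_filter.h"      // InterpKernel, FILTER_BITS, SUBPEL_*

static void convolve_horiz_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int x_step_q4, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;  // center the 8-tap window on the output pixel
  for (int y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (int x = 0; x < w; ++x) {
      const uint8_t *const s = &src[x_q4 >> SUBPEL_BITS];   // integer position
      const int16_t *const k = filter[x_q4 & SUBPEL_MASK];  // 1/16-pel phase
      int sum = 0;
      for (int t = 0; t < SUBPEL_TAPS; ++t) sum += s[t] * k[t];
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      x_q4 += x_step_q4;  // step of 16 keeps the sampling unscaled
    }
    src += src_stride;
    dst += dst_stride;
  }
}

With x0_q4 == 0 and x_step_q4 == 16 every output pixel uses kernel phase 0, which in each vp9_filter_kernels bank is the pass-through tap set {0, 0, 0, 128, 0, 0, 0, 0}; that is why the Copy* tests below can call the scaled entry points with vp9_filter_kernels[0] and zero offsets and still expect output identical to the input.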
@@ -162,7 +174,7 @@ void block2d_average_c(uint8_t *src, unsigned int src_stride, void filter_average_block2d_8_c(const uint8_t *src_ptr, const unsigned int src_stride, - const int16_t *HFilter, const int16_t *VFilter, + const int16_t *hfilter, const int16_t *vfilter, uint8_t *dst_ptr, unsigned int dst_stride, unsigned int output_width, unsigned int output_height) { @@ -170,7 +182,7 @@ void filter_average_block2d_8_c(const uint8_t *src_ptr, assert(output_width <= kMaxDimension); assert(output_height <= kMaxDimension); - filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64, + filter_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, tmp, 64, output_width, output_height); block2d_average_c(tmp, 64, dst_ptr, dst_stride, output_width, output_height); } @@ -178,7 +190,7 @@ void filter_average_block2d_8_c(const uint8_t *src_ptr, #if CONFIG_VP9_HIGHBITDEPTH void highbd_filter_block2d_8_c(const uint16_t *src_ptr, const unsigned int src_stride, - const int16_t *HFilter, const int16_t *VFilter, + const int16_t *hfilter, const int16_t *vfilter, uint16_t *dst_ptr, unsigned int dst_stride, unsigned int output_width, unsigned int output_height, int bd) { @@ -210,10 +222,10 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr, for (i = 0; i < intermediate_height; ++i) { for (j = 0; j < output_width; ++j) { // Apply filter... - const int temp = (src_ptr[0] * HFilter[0]) + (src_ptr[1] * HFilter[1]) + - (src_ptr[2] * HFilter[2]) + (src_ptr[3] * HFilter[3]) + - (src_ptr[4] * HFilter[4]) + (src_ptr[5] * HFilter[5]) + - (src_ptr[6] * HFilter[6]) + (src_ptr[7] * HFilter[7]) + + const int temp = (src_ptr[0] * hfilter[0]) + (src_ptr[1] * hfilter[1]) + + (src_ptr[2] * hfilter[2]) + (src_ptr[3] * hfilter[3]) + + (src_ptr[4] * hfilter[4]) + (src_ptr[5] * hfilter[5]) + + (src_ptr[6] * hfilter[6]) + (src_ptr[7] * hfilter[7]) + (VP9_FILTER_WEIGHT >> 1); // Rounding // Normalize back to 0-255... @@ -234,10 +246,10 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr, for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { // Apply filter... - const int temp = (src_ptr[0] * VFilter[0]) + (src_ptr[1] * VFilter[1]) + - (src_ptr[2] * VFilter[2]) + (src_ptr[3] * VFilter[3]) + - (src_ptr[4] * VFilter[4]) + (src_ptr[5] * VFilter[5]) + - (src_ptr[6] * VFilter[6]) + (src_ptr[7] * VFilter[7]) + + const int temp = (src_ptr[0] * vfilter[0]) + (src_ptr[1] * vfilter[1]) + + (src_ptr[2] * vfilter[2]) + (src_ptr[3] * vfilter[3]) + + (src_ptr[4] * vfilter[4]) + (src_ptr[5] * vfilter[5]) + + (src_ptr[6] * vfilter[6]) + (src_ptr[7] * vfilter[7]) + (VP9_FILTER_WEIGHT >> 1); // Rounding // Normalize back to 0-255... 
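The reference paths in the hunks above (filter_block2d_8_c and its high-bit-depth twin) share one rounding scheme: the eight tap products are accumulated together with a rounding term of VP9_FILTER_WEIGHT >> 1, then shifted down by VP9_FILTER_SHIFT and clipped. A quick arithmetic check, assuming those constants are 128 and 7 to match the 7-bit filter precision used elsewhere in libvpx, shows why any kernel whose taps sum to 128 reproduces a flat input exactly:

#include <assert.h>

int main(void) {
  const int kWeight = 128;  // assumed value of VP9_FILTER_WEIGHT
  const int kShift = 7;     // assumed value of VP9_FILTER_SHIFT
  for (int p = 0; p < 256; ++p) {
    // A kernel whose taps sum to kWeight maps a constant block to
    // kWeight * p per pixel; add the rounding term and shift back down.
    const int temp = kWeight * p + (kWeight >> 1);
    assert((temp >> kShift) == p);  // the original pixel value comes back
  }
  return 0;
}

The high-bit-depth variant differs only in the final clip, which goes to [0, (1 << bd) - 1] rather than [0, 255]; the "0-255" wording in its comments is carried over from the 8-bit version.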
@@ -265,20 +277,64 @@ void highbd_block2d_average_c(uint16_t *src, unsigned int src_stride, void highbd_filter_average_block2d_8_c( const uint16_t *src_ptr, const unsigned int src_stride, - const int16_t *HFilter, const int16_t *VFilter, uint16_t *dst_ptr, + const int16_t *hfilter, const int16_t *vfilter, uint16_t *dst_ptr, unsigned int dst_stride, unsigned int output_width, unsigned int output_height, int bd) { uint16_t tmp[kMaxDimension * kMaxDimension]; assert(output_width <= kMaxDimension); assert(output_height <= kMaxDimension); - highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64, + highbd_filter_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, tmp, 64, output_width, output_height, bd); highbd_block2d_average_c(tmp, 64, dst_ptr, dst_stride, output_width, output_height); } #endif // CONFIG_VP9_HIGHBITDEPTH +void wrapper_filter_average_block2d_8_c( + const uint8_t *src_ptr, const unsigned int src_stride, + const int16_t *hfilter, const int16_t *vfilter, uint8_t *dst_ptr, + unsigned int dst_stride, unsigned int output_width, + unsigned int output_height, int use_highbd) { +#if CONFIG_VP9_HIGHBITDEPTH + if (use_highbd == 0) { + filter_average_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr, + dst_stride, output_width, output_height); + } else { + highbd_filter_average_block2d_8_c(CAST_TO_SHORTPTR(src_ptr), src_stride, + hfilter, vfilter, + CAST_TO_SHORTPTR(dst_ptr), dst_stride, + output_width, output_height, use_highbd); + } +#else + ASSERT_EQ(0, use_highbd); + filter_average_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr, + dst_stride, output_width, output_height); +#endif +} + +void wrapper_filter_block2d_8_c(const uint8_t *src_ptr, + const unsigned int src_stride, + const int16_t *hfilter, const int16_t *vfilter, + uint8_t *dst_ptr, unsigned int dst_stride, + unsigned int output_width, + unsigned int output_height, int use_highbd) { +#if CONFIG_VP9_HIGHBITDEPTH + if (use_highbd == 0) { + filter_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr, + dst_stride, output_width, output_height); + } else { + highbd_filter_block2d_8_c(CAST_TO_SHORTPTR(src_ptr), src_stride, hfilter, + vfilter, CAST_TO_SHORTPTR(dst_ptr), dst_stride, + output_width, output_height, use_highbd); + } +#else + ASSERT_EQ(0, use_highbd); + filter_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr, dst_stride, + output_width, output_height); +#endif +} + class ConvolveTest : public ::testing::TestWithParam { public: static void SetUpTestCase() { @@ -404,7 +460,7 @@ class ConvolveTest : public ::testing::TestWithParam { if (UUT_->use_highbd_ == 0) { return input_ + offset; } else { - return CONVERT_TO_BYTEPTR(input16_) + offset; + return CAST_TO_BYTEPTR(input16_ + offset); } #else return input_ + offset; @@ -417,7 +473,7 @@ class ConvolveTest : public ::testing::TestWithParam { if (UUT_->use_highbd_ == 0) { return output_ + offset; } else { - return CONVERT_TO_BYTEPTR(output16_) + offset; + return CAST_TO_BYTEPTR(output16_ + offset); } #else return output_ + offset; @@ -430,7 +486,7 @@ class ConvolveTest : public ::testing::TestWithParam { if (UUT_->use_highbd_ == 0) { return output_ref_ + offset; } else { - return CONVERT_TO_BYTEPTR(output16_ref_) + offset; + return CAST_TO_BYTEPTR(output16_ref_ + offset); } #else return output_ref_ + offset; @@ -442,7 +498,7 @@ class ConvolveTest : public ::testing::TestWithParam { if (UUT_->use_highbd_ == 0) { return list[index]; } else { - return CONVERT_TO_SHORTPTR(list)[index]; + return CAST_TO_SHORTPTR(list)[index]; } 
#else return list[index]; @@ -454,57 +510,13 @@ class ConvolveTest : public ::testing::TestWithParam { if (UUT_->use_highbd_ == 0) { list[index] = (uint8_t)val; } else { - CONVERT_TO_SHORTPTR(list)[index] = val; + CAST_TO_SHORTPTR(list)[index] = val; } #else list[index] = (uint8_t)val; #endif } - void wrapper_filter_average_block2d_8_c( - const uint8_t *src_ptr, const unsigned int src_stride, - const int16_t *HFilter, const int16_t *VFilter, uint8_t *dst_ptr, - unsigned int dst_stride, unsigned int output_width, - unsigned int output_height) { -#if CONFIG_VP9_HIGHBITDEPTH - if (UUT_->use_highbd_ == 0) { - filter_average_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr, - dst_stride, output_width, output_height); - } else { - highbd_filter_average_block2d_8_c( - CONVERT_TO_SHORTPTR(src_ptr), src_stride, HFilter, VFilter, - CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, output_width, output_height, - UUT_->use_highbd_); - } -#else - filter_average_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr, - dst_stride, output_width, output_height); -#endif - } - - void wrapper_filter_block2d_8_c(const uint8_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter, - const int16_t *VFilter, uint8_t *dst_ptr, - unsigned int dst_stride, - unsigned int output_width, - unsigned int output_height) { -#if CONFIG_VP9_HIGHBITDEPTH - if (UUT_->use_highbd_ == 0) { - filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr, - dst_stride, output_width, output_height); - } else { - highbd_filter_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride, - HFilter, VFilter, CONVERT_TO_SHORTPTR(dst_ptr), - dst_stride, output_width, output_height, - UUT_->use_highbd_); - } -#else - filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr, - dst_stride, output_width, output_height); -#endif - } - const ConvolveFunctions *UUT_; static uint8_t *input_; static uint8_t *output_; @@ -528,12 +540,167 @@ uint16_t *ConvolveTest::output16_ref_ = NULL; TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); } +TEST_P(ConvolveTest, DISABLED_Copy_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve_copy_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_Avg_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve_avg_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? 
UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_Scale_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve_scale_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->hv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Horiz_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->h8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_horiz_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Vert_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->v8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_vert_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? 
UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->hv8_[1](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_avg_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + TEST_P(ConvolveTest, Copy) { uint8_t *const in = input(); uint8_t *const out = output(); - ASM_REGISTER_STATE_CHECK(UUT_->copy_(in, kInputStride, out, kOutputStride, - NULL, 0, NULL, 0, Width(), Height())); + ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, kOutputStride, + NULL, 0, 0, 0, 0, Width(), Height())); CheckGuardBlocks(); @@ -551,8 +718,8 @@ TEST_P(ConvolveTest, Avg) { uint8_t *const out_ref = output_ref(); CopyOutputToRef(); - ASM_REGISTER_STATE_CHECK(UUT_->avg_(in, kInputStride, out, kOutputStride, - NULL, 0, NULL, 0, Width(), Height())); + ASM_REGISTER_STATE_CHECK(UUT_->copy_[1](in, kInputStride, out, kOutputStride, + NULL, 0, 0, 0, 0, Width(), Height())); CheckGuardBlocks(); @@ -569,12 +736,10 @@ TEST_P(ConvolveTest, Avg) { TEST_P(ConvolveTest, CopyHoriz) { uint8_t *const in = input(); uint8_t *const out = output(); - DECLARE_ALIGNED(256, const int16_t, - filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 }; - ASM_REGISTER_STATE_CHECK(UUT_->sh8_(in, kInputStride, out, kOutputStride, - filter8, 16, filter8, 16, Width(), - Height())); + ASM_REGISTER_STATE_CHECK(UUT_->sh8_[0](in, kInputStride, out, kOutputStride, + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); CheckGuardBlocks(); @@ -589,12 +754,10 @@ TEST_P(ConvolveTest, CopyHoriz) { TEST_P(ConvolveTest, CopyVert) { uint8_t *const in = input(); uint8_t *const out = output(); - DECLARE_ALIGNED(256, const int16_t, - filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 }; - ASM_REGISTER_STATE_CHECK(UUT_->sv8_(in, kInputStride, out, kOutputStride, - filter8, 16, filter8, 16, Width(), - Height())); + ASM_REGISTER_STATE_CHECK(UUT_->sv8_[0](in, kInputStride, out, kOutputStride, + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); CheckGuardBlocks(); @@ -609,12 +772,10 @@ TEST_P(ConvolveTest, CopyVert) { TEST_P(ConvolveTest, Copy2D) { uint8_t *const in = input(); uint8_t *const out = output(); - DECLARE_ALIGNED(256, const int16_t, - filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 }; - ASM_REGISTER_STATE_CHECK(UUT_->shv8_(in, kInputStride, out, kOutputStride, - filter8, 16, filter8, 16, Width(), - Height())); + ASM_REGISTER_STATE_CHECK(UUT_->shv8_[0](in, kInputStride, out, kOutputStride, + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); CheckGuardBlocks(); @@ -650,138 +811,84 @@ TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) { } } -const int16_t kInvalidFilter[8] = { 0 }; +const WrapperFilterBlock2d8Func wrapper_filter_block2d_8[2] = { + wrapper_filter_block2d_8_c, wrapper_filter_average_block2d_8_c +}; TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) { - uint8_t *const in = input(); - uint8_t *const out = output(); + for (int i = 0; i < 2; ++i) { + uint8_t *const in = input(); + uint8_t *const out = output(); #if 
CONFIG_VP9_HIGHBITDEPTH - uint8_t ref8[kOutputStride * kMaxDimension]; - uint16_t ref16[kOutputStride * kMaxDimension]; - uint8_t *ref; - if (UUT_->use_highbd_ == 0) { - ref = ref8; - } else { - ref = CONVERT_TO_BYTEPTR(ref16); - } + uint8_t ref8[kOutputStride * kMaxDimension]; + uint16_t ref16[kOutputStride * kMaxDimension]; + uint8_t *ref; + if (UUT_->use_highbd_ == 0) { + ref = ref8; + } else { + ref = CAST_TO_BYTEPTR(ref16); + } #else - uint8_t ref[kOutputStride * kMaxDimension]; + uint8_t ref[kOutputStride * kMaxDimension]; #endif - for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) { - const InterpKernel *filters = - vp9_filter_kernels[static_cast(filter_bank)]; - - for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) { - for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) { - wrapper_filter_block2d_8_c(in, kInputStride, filters[filter_x], - filters[filter_y], ref, kOutputStride, - Width(), Height()); - - if (filter_x && filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->hv8_( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - filters[filter_y], 16, Width(), Height())); - else if (filter_y) - ASM_REGISTER_STATE_CHECK( - UUT_->v8_(in, kInputStride, out, kOutputStride, kInvalidFilter, - 16, filters[filter_y], 16, Width(), Height())); - else if (filter_x) - ASM_REGISTER_STATE_CHECK( - UUT_->h8_(in, kInputStride, out, kOutputStride, filters[filter_x], - 16, kInvalidFilter, 16, Width(), Height())); - else - ASM_REGISTER_STATE_CHECK( - UUT_->copy_(in, kInputStride, out, kOutputStride, kInvalidFilter, - 0, kInvalidFilter, 0, Width(), Height())); - - CheckGuardBlocks(); - - for (int y = 0; y < Height(); ++y) { - for (int x = 0; x < Width(); ++x) - ASSERT_EQ(lookup(ref, y * kOutputStride + x), - lookup(out, y * kOutputStride + x)) - << "mismatch at (" << x << "," << y << "), " - << "filters (" << filter_bank << "," << filter_x << "," - << filter_y << ")"; + // Populate ref and out with some random data + ::libvpx_test::ACMRandom prng; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + uint16_t r; +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) { + r = prng.Rand8Extremes(); + } else { + r = prng.Rand16() & mask_; } - } - } - } -} - -TEST_P(ConvolveTest, MatchesReferenceAveragingSubpixelFilter) { - uint8_t *const in = input(); - uint8_t *const out = output(); -#if CONFIG_VP9_HIGHBITDEPTH - uint8_t ref8[kOutputStride * kMaxDimension]; - uint16_t ref16[kOutputStride * kMaxDimension]; - uint8_t *ref; - if (UUT_->use_highbd_ == 0) { - ref = ref8; - } else { - ref = CONVERT_TO_BYTEPTR(ref16); - } #else - uint8_t ref[kOutputStride * kMaxDimension]; -#endif - - // Populate ref and out with some random data - ::libvpx_test::ACMRandom prng; - for (int y = 0; y < Height(); ++y) { - for (int x = 0; x < Width(); ++x) { - uint16_t r; -#if CONFIG_VP9_HIGHBITDEPTH - if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) { r = prng.Rand8Extremes(); - } else { - r = prng.Rand16() & mask_; - } -#else - r = prng.Rand8Extremes(); #endif - assign_val(out, y * kOutputStride + x, r); - assign_val(ref, y * kOutputStride + x, r); + assign_val(out, y * kOutputStride + x, r); + assign_val(ref, y * kOutputStride + x, r); + } } - } - for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) { - const InterpKernel *filters = - vp9_filter_kernels[static_cast(filter_bank)]; + for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) { + const InterpKernel *filters = + 
vp9_filter_kernels[static_cast(filter_bank)]; - for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) { - for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) { - wrapper_filter_average_block2d_8_c(in, kInputStride, filters[filter_x], - filters[filter_y], ref, - kOutputStride, Width(), Height()); + for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) { + for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) { + wrapper_filter_block2d_8[i](in, kInputStride, filters[filter_x], + filters[filter_y], ref, kOutputStride, + Width(), Height(), UUT_->use_highbd_); - if (filter_x && filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->hv8_avg_( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - filters[filter_y], 16, Width(), Height())); - else if (filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->v8_avg_( - in, kInputStride, out, kOutputStride, kInvalidFilter, 16, - filters[filter_y], 16, Width(), Height())); - else if (filter_x) - ASM_REGISTER_STATE_CHECK(UUT_->h8_avg_( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - kInvalidFilter, 16, Width(), Height())); - else - ASM_REGISTER_STATE_CHECK( - UUT_->avg_(in, kInputStride, out, kOutputStride, kInvalidFilter, - 0, kInvalidFilter, 0, Width(), Height())); + if (filter_x && filter_y) + ASM_REGISTER_STATE_CHECK( + UUT_->hv8_[i](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height())); + else if (filter_y) + ASM_REGISTER_STATE_CHECK( + UUT_->v8_[i](in, kInputStride, out, kOutputStride, filters, 0, + 16, filter_y, 16, Width(), Height())); + else if (filter_x) + ASM_REGISTER_STATE_CHECK( + UUT_->h8_[i](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, 0, 16, Width(), Height())); + else + ASM_REGISTER_STATE_CHECK(UUT_->copy_[i](in, kInputStride, out, + kOutputStride, NULL, 0, 0, + 0, 0, Width(), Height())); - CheckGuardBlocks(); + CheckGuardBlocks(); - for (int y = 0; y < Height(); ++y) { - for (int x = 0; x < Width(); ++x) - ASSERT_EQ(lookup(ref, y * kOutputStride + x), - lookup(out, y * kOutputStride + x)) - << "mismatch at (" << x << "," << y << "), " - << "filters (" << filter_bank << "," << filter_x << "," - << filter_y << ")"; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) + ASSERT_EQ(lookup(ref, y * kOutputStride + x), + lookup(out, y * kOutputStride + x)) + << "mismatch at (" << x << "," << y << "), " + << "filters (" << filter_bank << "," << filter_x << "," + << filter_y << ")"; + } } } } @@ -798,7 +905,7 @@ TEST_P(ConvolveTest, FilterExtremes) { if (UUT_->use_highbd_ == 0) { ref = ref8; } else { - ref = CONVERT_TO_BYTEPTR(ref16); + ref = CAST_TO_BYTEPTR(ref16); } #else uint8_t ref[kOutputStride * kMaxDimension]; @@ -852,23 +959,23 @@ TEST_P(ConvolveTest, FilterExtremes) { for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) { wrapper_filter_block2d_8_c(in, kInputStride, filters[filter_x], filters[filter_y], ref, kOutputStride, - Width(), Height()); + Width(), Height(), UUT_->use_highbd_); if (filter_x && filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->hv8_( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->hv8_[0](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height())); else if (filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->v8_( - in, kInputStride, out, kOutputStride, kInvalidFilter, 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->v8_[0](in, kInputStride, out, 
kOutputStride, filters, 0, + 16, filter_y, 16, Width(), Height())); else if (filter_x) - ASM_REGISTER_STATE_CHECK(UUT_->h8_( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - kInvalidFilter, 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->h8_[0](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, 0, 16, Width(), Height())); else - ASM_REGISTER_STATE_CHECK(UUT_->copy_( - in, kInputStride, out, kOutputStride, kInvalidFilter, 0, - kInvalidFilter, 0, Width(), Height())); + ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, + kOutputStride, NULL, 0, 0, + 0, 0, Width(), Height())); for (int y = 0; y < Height(); ++y) { for (int x = 0; x < Width(); ++x) @@ -887,33 +994,51 @@ TEST_P(ConvolveTest, FilterExtremes) { /* This test exercises that enough rows and columns are filtered with every possible initial fractional positions and scaling steps. */ +#if !CONFIG_VP9_HIGHBITDEPTH +static const ConvolveFunc scaled_2d_c_funcs[2] = { vpx_scaled_2d_c, + vpx_scaled_avg_2d_c }; + TEST_P(ConvolveTest, CheckScalingFiltering) { uint8_t *const in = input(); uint8_t *const out = output(); - const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP]; + uint8_t ref[kOutputStride * kMaxDimension]; - SetConstantInput(127); + ::libvpx_test::ACMRandom prng; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + const uint16_t r = prng.Rand8Extremes(); + assign_val(in, y * kInputStride + x, r); + } + } - for (int frac = 0; frac < 16; ++frac) { - for (int step = 1; step <= 32; ++step) { - /* Test the horizontal and vertical filters in combination. */ - ASM_REGISTER_STATE_CHECK(UUT_->shv8_(in, kInputStride, out, kOutputStride, - eighttap[frac], step, eighttap[frac], - step, Width(), Height())); + for (int i = 0; i < 2; ++i) { + for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) { + const InterpKernel *const eighttap = vp9_filter_kernels[filter_type]; + for (int frac = 0; frac < 16; ++frac) { + for (int step = 1; step <= 32; ++step) { + /* Test the horizontal and vertical filters in combination. 
*/ + scaled_2d_c_funcs[i](in, kInputStride, ref, kOutputStride, eighttap, + frac, step, frac, step, Width(), Height()); + ASM_REGISTER_STATE_CHECK( + UUT_->shv8_[i](in, kInputStride, out, kOutputStride, eighttap, + frac, step, frac, step, Width(), Height())); - CheckGuardBlocks(); + CheckGuardBlocks(); - for (int y = 0; y < Height(); ++y) { - for (int x = 0; x < Width(); ++x) { - ASSERT_EQ(lookup(in, y * kInputStride + x), - lookup(out, y * kOutputStride + x)) - << "x == " << x << ", y == " << y << ", frac == " << frac - << ", step == " << step; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + ASSERT_EQ(lookup(ref, y * kOutputStride + x), + lookup(out, y * kOutputStride + x)) + << "x == " << x << ", y == " << y << ", frac == " << frac + << ", step == " << step; + } + } } } } } } +#endif using std::tr1::make_tuple; @@ -921,11 +1046,13 @@ using std::tr1::make_tuple; #define WRAP(func, bd) \ void wrap_##func##_##bd( \ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, \ - const int16_t *filter_y, int filter_y_stride, int w, int h) { \ - vpx_highbd_##func(src, src_stride, dst, dst_stride, filter_x, \ - filter_x_stride, filter_y, filter_y_stride, w, h, bd); \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + vpx_highbd_##func(reinterpret_cast(src), src_stride, \ + reinterpret_cast(dst), dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \ } + #if HAVE_SSE2 && ARCH_X86_64 WRAP(convolve_copy_sse2, 8) WRAP(convolve_avg_sse2, 8) @@ -953,6 +1080,62 @@ WRAP(convolve8_sse2, 12) WRAP(convolve8_avg_sse2, 12) #endif // HAVE_SSE2 && ARCH_X86_64 +#if HAVE_AVX2 +WRAP(convolve_copy_avx2, 8) +WRAP(convolve_avg_avx2, 8) +WRAP(convolve8_horiz_avx2, 8) +WRAP(convolve8_avg_horiz_avx2, 8) +WRAP(convolve8_vert_avx2, 8) +WRAP(convolve8_avg_vert_avx2, 8) +WRAP(convolve8_avx2, 8) +WRAP(convolve8_avg_avx2, 8) + +WRAP(convolve_copy_avx2, 10) +WRAP(convolve_avg_avx2, 10) +WRAP(convolve8_avx2, 10) +WRAP(convolve8_horiz_avx2, 10) +WRAP(convolve8_vert_avx2, 10) +WRAP(convolve8_avg_avx2, 10) +WRAP(convolve8_avg_horiz_avx2, 10) +WRAP(convolve8_avg_vert_avx2, 10) + +WRAP(convolve_copy_avx2, 12) +WRAP(convolve_avg_avx2, 12) +WRAP(convolve8_avx2, 12) +WRAP(convolve8_horiz_avx2, 12) +WRAP(convolve8_vert_avx2, 12) +WRAP(convolve8_avg_avx2, 12) +WRAP(convolve8_avg_horiz_avx2, 12) +WRAP(convolve8_avg_vert_avx2, 12) +#endif // HAVE_AVX2 + +#if HAVE_NEON +WRAP(convolve_copy_neon, 8) +WRAP(convolve_avg_neon, 8) +WRAP(convolve_copy_neon, 10) +WRAP(convolve_avg_neon, 10) +WRAP(convolve_copy_neon, 12) +WRAP(convolve_avg_neon, 12) +WRAP(convolve8_horiz_neon, 8) +WRAP(convolve8_avg_horiz_neon, 8) +WRAP(convolve8_vert_neon, 8) +WRAP(convolve8_avg_vert_neon, 8) +WRAP(convolve8_neon, 8) +WRAP(convolve8_avg_neon, 8) +WRAP(convolve8_horiz_neon, 10) +WRAP(convolve8_avg_horiz_neon, 10) +WRAP(convolve8_vert_neon, 10) +WRAP(convolve8_avg_vert_neon, 10) +WRAP(convolve8_neon, 10) +WRAP(convolve8_avg_neon, 10) +WRAP(convolve8_horiz_neon, 12) +WRAP(convolve8_avg_horiz_neon, 12) +WRAP(convolve8_vert_neon, 12) +WRAP(convolve8_avg_vert_neon, 12) +WRAP(convolve8_neon, 12) +WRAP(convolve8_avg_neon, 12) +#endif // HAVE_NEON + WRAP(convolve_copy_c, 8) WRAP(convolve_avg_c, 8) WRAP(convolve8_horiz_c, 8) @@ -1071,39 +1254,90 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::ValuesIn(kArrayConvolve8_ssse3)); #endif -#if HAVE_AVX2 && HAVE_SSSE3 +#if 
HAVE_AVX2 +#if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions convolve8_avx2( + wrap_convolve_copy_avx2_8, wrap_convolve_avg_avx2_8, + wrap_convolve8_horiz_avx2_8, wrap_convolve8_avg_horiz_avx2_8, + wrap_convolve8_vert_avx2_8, wrap_convolve8_avg_vert_avx2_8, + wrap_convolve8_avx2_8, wrap_convolve8_avg_avx2_8, wrap_convolve8_horiz_c_8, + wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8, + wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8); +const ConvolveFunctions convolve10_avx2( + wrap_convolve_copy_avx2_10, wrap_convolve_avg_avx2_10, + wrap_convolve8_horiz_avx2_10, wrap_convolve8_avg_horiz_avx2_10, + wrap_convolve8_vert_avx2_10, wrap_convolve8_avg_vert_avx2_10, + wrap_convolve8_avx2_10, wrap_convolve8_avg_avx2_10, + wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10, + wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, + wrap_convolve8_avg_c_10, 10); +const ConvolveFunctions convolve12_avx2( + wrap_convolve_copy_avx2_12, wrap_convolve_avg_avx2_12, + wrap_convolve8_horiz_avx2_12, wrap_convolve8_avg_horiz_avx2_12, + wrap_convolve8_vert_avx2_12, wrap_convolve8_avg_vert_avx2_12, + wrap_convolve8_avx2_12, wrap_convolve8_avg_avx2_12, + wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12, + wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, + wrap_convolve8_avg_c_12, 12); +const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2), + ALL_SIZES(convolve10_avx2), + ALL_SIZES(convolve12_avx2) }; +INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_avx2)); +#else // !CONFIG_VP9_HIGHBITDEPTH const ConvolveFunctions convolve8_avx2( vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_avx2, - vpx_convolve8_avg_horiz_ssse3, vpx_convolve8_vert_avx2, - vpx_convolve8_avg_vert_ssse3, vpx_convolve8_avx2, vpx_convolve8_avg_ssse3, + vpx_convolve8_avg_horiz_avx2, vpx_convolve8_vert_avx2, + vpx_convolve8_avg_vert_avx2, vpx_convolve8_avx2, vpx_convolve8_avg_avx2, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); - const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) }; INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::ValuesIn(kArrayConvolve8_avx2)); -#endif // HAVE_AVX2 && HAVE_SSSE3 +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_AVX2 #if HAVE_NEON -#if HAVE_NEON_ASM +#if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions convolve8_neon( + wrap_convolve_copy_neon_8, wrap_convolve_avg_neon_8, + wrap_convolve8_horiz_neon_8, wrap_convolve8_avg_horiz_neon_8, + wrap_convolve8_vert_neon_8, wrap_convolve8_avg_vert_neon_8, + wrap_convolve8_neon_8, wrap_convolve8_avg_neon_8, + wrap_convolve8_horiz_neon_8, wrap_convolve8_avg_horiz_neon_8, + wrap_convolve8_vert_neon_8, wrap_convolve8_avg_vert_neon_8, + wrap_convolve8_neon_8, wrap_convolve8_avg_neon_8, 8); +const ConvolveFunctions convolve10_neon( + wrap_convolve_copy_neon_10, wrap_convolve_avg_neon_10, + wrap_convolve8_horiz_neon_10, wrap_convolve8_avg_horiz_neon_10, + wrap_convolve8_vert_neon_10, wrap_convolve8_avg_vert_neon_10, + wrap_convolve8_neon_10, wrap_convolve8_avg_neon_10, + wrap_convolve8_horiz_neon_10, wrap_convolve8_avg_horiz_neon_10, + wrap_convolve8_vert_neon_10, wrap_convolve8_avg_vert_neon_10, + wrap_convolve8_neon_10, wrap_convolve8_avg_neon_10, 10); +const ConvolveFunctions convolve12_neon( + wrap_convolve_copy_neon_12, wrap_convolve_avg_neon_12, + wrap_convolve8_horiz_neon_12, 
wrap_convolve8_avg_horiz_neon_12, + wrap_convolve8_vert_neon_12, wrap_convolve8_avg_vert_neon_12, + wrap_convolve8_neon_12, wrap_convolve8_avg_neon_12, + wrap_convolve8_horiz_neon_12, wrap_convolve8_avg_horiz_neon_12, + wrap_convolve8_vert_neon_12, wrap_convolve8_avg_vert_neon_12, + wrap_convolve8_neon_12, wrap_convolve8_avg_neon_12, 12); +const ConvolveParam kArrayConvolve_neon[] = { ALL_SIZES(convolve8_neon), + ALL_SIZES(convolve10_neon), + ALL_SIZES(convolve12_neon) }; +#else const ConvolveFunctions convolve8_neon( vpx_convolve_copy_neon, vpx_convolve_avg_neon, vpx_convolve8_horiz_neon, vpx_convolve8_avg_horiz_neon, vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon, vpx_convolve8_neon, vpx_convolve8_avg_neon, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, - vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); -#else // HAVE_NEON -const ConvolveFunctions convolve8_neon( - vpx_convolve_copy_neon, vpx_convolve_avg_neon, vpx_convolve8_horiz_neon, - vpx_convolve8_avg_horiz_neon, vpx_convolve8_vert_neon, - vpx_convolve8_avg_vert_neon, vpx_convolve8_neon, vpx_convolve8_avg_neon, - vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, - vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); -#endif // HAVE_NEON_ASM + vpx_scaled_avg_vert_c, vpx_scaled_2d_neon, vpx_scaled_avg_2d_c, 0); -const ConvolveParam kArrayConvolve8_neon[] = { ALL_SIZES(convolve8_neon) }; +const ConvolveParam kArrayConvolve_neon[] = { ALL_SIZES(convolve8_neon) }; +#endif // CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve8_neon)); + ::testing::ValuesIn(kArrayConvolve_neon)); #endif // HAVE_NEON #if HAVE_DSPR2 @@ -1125,10 +1359,22 @@ const ConvolveFunctions convolve8_msa( vpx_convolve8_avg_horiz_msa, vpx_convolve8_vert_msa, vpx_convolve8_avg_vert_msa, vpx_convolve8_msa, vpx_convolve8_avg_msa, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, - vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); + vpx_scaled_avg_vert_c, vpx_scaled_2d_msa, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES(convolve8_msa) }; INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, ::testing::ValuesIn(kArrayConvolve8_msa)); #endif // HAVE_MSA + +#if HAVE_VSX +const ConvolveFunctions convolve8_vsx( + vpx_convolve_copy_vsx, vpx_convolve_avg_vsx, vpx_convolve8_horiz_vsx, + vpx_convolve8_avg_horiz_vsx, vpx_convolve8_vert_vsx, + vpx_convolve8_avg_vert_vsx, vpx_convolve8_vsx, vpx_convolve8_avg_vsx, + vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, + vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); +const ConvolveParam kArrayConvolve_vsx[] = { ALL_SIZES(convolve8_vsx) }; +INSTANTIATE_TEST_CASE_P(VSX, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_vsx)); +#endif // HAVE_VSX } // namespace diff --git a/libs/libvpx/test/datarate_test.cc b/libs/libvpx/test/datarate_test.cc index 67c78bc91c..31a8523d21 100644 --- a/libs/libvpx/test/datarate_test.cc +++ b/libs/libvpx/test/datarate_test.cc @@ -20,7 +20,7 @@ namespace { class DatarateTestLarge : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWithParam { + public ::libvpx_test::CodecTestWith2Params { public: DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {} @@ -30,6 +30,7 @@ class DatarateTestLarge virtual void SetUp() { InitializeConfig(); SetMode(GET_PARAM(1)); + set_cpu_used_ = GET_PARAM(2); ResetModel(); } @@ -42,14 +43,24 @@ class DatarateTestLarge duration_ = 0.0; denoiser_offon_test_ = 0; 
denoiser_offon_period_ = -1; + gf_boost_ = 0; + use_roi_ = 0; } virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { if (video->frame() == 0) { encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_); } +#if CONFIG_VP8_ENCODER + if (use_roi_ == 1) { + encoder->Control(VP8E_SET_ROI_MAP, &roi_); + } +#endif + if (denoiser_offon_test_) { ASSERT_GT(denoiser_offon_period_, 0) << "denoiser_offon_period_ is not positive."; @@ -87,8 +98,8 @@ class DatarateTestLarge const bool key_frame = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; if (!key_frame) { - ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame " - << pkt->data.frame.pts; + ASSERT_GE(bits_in_buffer_model_, 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; } const int64_t frame_size_in_bits = pkt->data.frame.sz * 8; @@ -139,6 +150,10 @@ class DatarateTestLarge int denoiser_on_; int denoiser_offon_test_; int denoiser_offon_period_; + int set_cpu_used_; + int gf_boost_; + int use_roi_; + vpx_roi_map_t roi_; }; #if CONFIG_TEMPORAL_DENOISING @@ -156,9 +171,6 @@ TEST_P(DatarateTestLarge, DenoiserLevels) { // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV, // denoiserOnAggressive, and denoiserOnAdaptive. - // For the spatial denoiser (if !CONFIG_TEMPORAL_DENOISING), the level j - // refers to the blur thresholds: 20, 40, 60 80. - // The j = 0 case (denoiser off) is covered in the tests below. denoiser_on_ = j; cfg_.rc_target_bitrate = 300; ResetModel(); @@ -166,7 +178,7 @@ TEST_P(DatarateTestLarge, DenoiserLevels) { ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3) + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) << " The datarate for the file missed the target!"; } } @@ -190,7 +202,7 @@ TEST_P(DatarateTestLarge, DenoiserOffOn) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3) + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) << " The datarate for the file missed the target!"; } #endif // CONFIG_TEMPORAL_DENOISING @@ -221,8 +233,7 @@ TEST_P(DatarateTestLarge, BasicBufferModel) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) << " The datarate for the file exceeds the target!"; - - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3) + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) << " The datarate for the file missed the target!"; } } @@ -256,15 +267,142 @@ TEST_P(DatarateTestLarge, ChangingDropFrameThresh) { } } -// Disabled for tsan, see: -// https://bugs.chromium.org/p/webm/issues/detail?id=1049 -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#define BUILDING_WITH_TSAN -#endif -#endif -#ifndef BUILDING_WITH_TSAN TEST_P(DatarateTestLarge, DropFramesMultiThreads) { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 30; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_threads = 2; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 140); + cfg_.rc_target_bitrate = 200; + ResetModel(); + 
ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; +} + +class DatarateTestRealTime : public DatarateTestLarge { + public: + virtual ~DatarateTestRealTime() {} +}; + +#if CONFIG_TEMPORAL_DENOISING +// Check basic datarate targeting, for a single bitrate, but loop over the +// various denoiser settings. +TEST_P(DatarateTestRealTime, DenoiserLevels) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 140); + for (int j = 1; j < 5; ++j) { + // Run over the denoiser levels. + // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j + // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV, + // denoiserOnAggressive, and denoiserOnAdaptive. + denoiser_on_ = j; + cfg_.rc_target_bitrate = 300; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + } +} + +// Check basic datarate targeting, for a single bitrate, when denoiser is off +// and on. +TEST_P(DatarateTestRealTime, DenoiserOffOn) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 299); + cfg_.rc_target_bitrate = 300; + ResetModel(); + // The denoiser is off by default. + denoiser_on_ = 0; + // Set the offon test flag. + denoiser_offon_test_ = 1; + denoiser_offon_period_ = 100; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; +} +#endif // CONFIG_TEMPORAL_DENOISING + +TEST_P(DatarateTestRealTime, BasicBufferModel) { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + // 2 pass cbr datarate control has a bug hidden by the small # of + // frames selected in this encode. The problem is that even if the buffer is + // negative we produce a keyframe on a cutscene, ignoring datarate + // constraints + // TODO(jimbankoski): Fix when issue + // http://bugs.chromium.org/p/webm/issues/detail?id=495 is addressed. + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 140); + + // There is an issue for low bitrates in real-time mode, where the + // effective_datarate slightly overshoots the target bitrate. + // This is same the issue as noted above (#495). + // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100), + // when the issue is resolved. 
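+  // Note on the bounds used below: ASSERT_GE(target, effective * 0.95) caps
+  // the overshoot at roughly 5% (e.g. at the 100 kbps end of the sweep the
+  // measured effective_datarate_ may not exceed ~105 kbps), while
+  // ASSERT_LE(target, file_datarate_ * 1.4) tolerates an undershoot down to
+  // about target / 1.4, i.e. ~71 kbps for the 100 kbps case.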
+  for (int i = 100; i <= 700; i += 200) {
+    cfg_.rc_target_bitrate = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+        << " The datarate for the file exceeds the target!";
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+        << " The datarate for the file missed the target!";
+  }
+}
+
+TEST_P(DatarateTestRealTime, ChangingDropFrameThresh) {
+  denoiser_on_ = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_max_quantizer = 36;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.kf_mode = VPX_KF_DISABLED;
+
+  const int frame_count = 40;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, frame_count);
+
+  // Check that the first dropped frame gets earlier and earlier
+  // as the drop frame threshold is increased.
+
+  const int kDropFrameThreshTestStep = 30;
+  vpx_codec_pts_t last_drop = frame_count;
+  for (int i = 1; i < 91; i += kDropFrameThreshTestStep) {
+    cfg_.rc_dropframe_thresh = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_LE(first_drop_, last_drop)
+        << " The first dropped frame for drop_thresh " << i
+        << " > first dropped frame for drop_thresh "
+        << i - kDropFrameThreshTestStep;
+    last_drop = first_drop_;
+  }
+}
+
+TEST_P(DatarateTestRealTime, DropFramesMultiThreads) {
   denoiser_on_ = 0;
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_dropframe_thresh = 30;
@@ -281,10 +419,93 @@ TEST_P(DatarateTestLarge, DropFramesMultiThreads) {
   ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
       << " The datarate for the file exceeds the target!";
-  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3)
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+      << " The datarate for the file missed the target!";
+}
+
+TEST_P(DatarateTestRealTime, RegionOfInterest) {
+  denoiser_on_ = 0;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  // Encode using multiple threads.
+  cfg_.g_threads = 2;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 450;
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+
+  ResetModel();
+
+  // Set ROI parameters
+  use_roi_ = 1;
+  memset(&roi_, 0, sizeof(roi_));
+
+  roi_.rows = (cfg_.g_h + 15) / 16;
+  roi_.cols = (cfg_.g_w + 15) / 16;
+
+  roi_.delta_q[0] = 0;
+  roi_.delta_q[1] = -20;
+  roi_.delta_q[2] = 0;
+  roi_.delta_q[3] = 0;
+
+  roi_.delta_lf[0] = 0;
+  roi_.delta_lf[1] = -20;
+  roi_.delta_lf[2] = 0;
+  roi_.delta_lf[3] = 0;
+
+  roi_.static_threshold[0] = 0;
+  roi_.static_threshold[1] = 1000;
+  roi_.static_threshold[2] = 0;
+  roi_.static_threshold[3] = 0;
+
+  // Use 2 states: 1 is center square, 0 is the rest.
+ roi_.roi_map = + (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)); + for (unsigned int i = 0; i < roi_.rows; ++i) { + for (unsigned int j = 0; j < roi_.cols; ++j) { + if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) && + j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) { + roi_.roi_map[i * roi_.cols + j] = 1; + } + } + } + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + + free(roi_.roi_map); +} + +TEST_P(DatarateTestRealTime, GFBoost) { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_error_resilient = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + cfg_.rc_target_bitrate = 300; + ResetModel(); + // Apply a gf boost. + gf_boost_ = 50; + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) << " The datarate for the file missed the target!"; } -#endif class DatarateTestVP9Large : public ::libvpx_test::EncoderTest, @@ -317,6 +538,7 @@ class DatarateTestVP9Large } denoiser_offon_test_ = 0; denoiser_offon_period_ = -1; + frame_parallel_decoding_mode_ = 1; } // @@ -331,8 +553,8 @@ class DatarateTestVP9Large // 2 6 // 0 4 .... // LAST is always update on base/layer 0, GOLDEN is updated on layer 1. - // For this 3 layer example, the 2nd enhancement layer (layer 2) does not - // update any reference frames. + // For this 3 layer example, the 2nd enhancement layer (layer 2) updates + // the altref frame. int SetFrameFlags(int frame_num, int num_temp_layers) { int frame_flags = 0; if (num_temp_layers == 2) { @@ -354,9 +576,8 @@ class DatarateTestVP9Large // Layer 1: predict from L, G, ARF; update G. frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; } else if ((frame_num - 1) % 2 == 0) { - // Layer 2: predict from L, G, ARF; update none. - frame_flags = - VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; + // Layer 2: predict from L, G, ARF; update ARF. + frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST; } } return frame_flags; @@ -396,6 +617,9 @@ class DatarateTestVP9Large } encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); + encoder->Control(VP9E_SET_TILE_COLUMNS, (cfg_.g_threads >> 1)); + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, + frame_parallel_decoding_mode_); if (cfg_.ts_number_layers > 1) { if (video->frame() == 0) { @@ -434,8 +658,8 @@ class DatarateTestVP9Large duration * timebase_ * cfg_.rc_target_bitrate * 1000); // Buffer should not go negative. - ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame " - << pkt->data.frame.pts; + ASSERT_GE(bits_in_buffer_model_, 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; const size_t frame_size_in_bits = pkt->data.frame.sz * 8; @@ -476,6 +700,7 @@ class DatarateTestVP9Large int denoiser_on_; int denoiser_offon_test_; int denoiser_offon_period_; + int frame_parallel_decoding_mode_; }; // Check basic rate targeting for VBR mode with 0 lag. 
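As an aside, here is a minimal sketch (not part of this patch) of how an application might hand the same kind of ROI map to a VP8 encoder through the public control interface. It assumes an already-initialized `vpx_codec_ctx_t` and a helper name (`set_center_roi`) invented for illustration; the `vpx_roi_map_t` fields mirror the ones exercised by the RegionOfInterest test above.

```c
#include <stdlib.h>
#include <string.h>
#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Boost quality in a centered rectangle, similar to the test above. */
static vpx_codec_err_t set_center_roi(vpx_codec_ctx_t *encoder,
                                      unsigned int width,
                                      unsigned int height) {
  vpx_roi_map_t roi;
  memset(&roi, 0, sizeof(roi));
  roi.rows = (height + 15) / 16; /* one cell per 16x16 macroblock */
  roi.cols = (width + 15) / 16;
  roi.delta_q[1] = -20;          /* segment 1: lower QP (higher quality)  */
  roi.delta_lf[1] = -20;         /* segment 1: weaker loop filtering      */
  roi.static_threshold[1] = 1000;
  roi.roi_map = (unsigned char *)calloc(roi.rows * roi.cols, 1);
  if (!roi.roi_map) return VPX_CODEC_MEM_ERROR;
  /* Mark roughly the center quarter of the frame as segment 1. */
  for (unsigned int i = roi.rows / 4; i < (roi.rows * 3) / 4; ++i)
    for (unsigned int j = roi.cols / 4; j < (roi.cols * 3) / 4; ++j)
      roi.roi_map[i * roi.cols + j] = 1;
  const vpx_codec_err_t res =
      vpx_codec_control(encoder, VP8E_SET_ROI_MAP, &roi);
  free(roi.roi_map);
  return res;
}
```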
@@ -494,7 +719,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagZero) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25) + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) << " The datarate for the file is greater than target by too much!"; } } @@ -521,7 +746,37 @@ TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZero) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25) + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) + << " The datarate for the file is greater than target by too much!"; + } +} + +// Check basic rate targeting for VBR mode with non-zero lag, with +// frame_parallel_decoding_mode off. This enables the adapt_coeff/mode/mv probs +// since error_resilience is off. +TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZeroFrameParDecOff) { + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_error_resilient = 0; + cfg_.rc_end_usage = VPX_VBR; + // For non-zero lag, rate control will work (be within bounds) for + // real-time mode. + if (deadline_ == VPX_DL_REALTIME) { + cfg_.g_lag_in_frames = 15; + } else { + cfg_.g_lag_in_frames = 0; + } + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + for (int i = 400; i <= 800; i += 400) { + cfg_.rc_target_bitrate = i; + ResetModel(); + frame_parallel_decoding_mode_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) << " The datarate for the file is greater than target by too much!"; } } @@ -550,6 +805,33 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting) { } } +// Check basic rate targeting for CBR mode, with frame_parallel_decoding_mode +// off( and error_resilience off). +TEST_P(DatarateTestVP9Large, BasicRateTargetingFrameParDecOff) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 140); + for (int i = 150; i < 800; i += 200) { + cfg_.rc_target_bitrate = i; + ResetModel(); + frame_parallel_decoding_mode_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; + } +} + // Check basic rate targeting for CBR mode, with 2 threads and dropped frames. 
TEST_P(DatarateTestVP9Large, BasicRateTargetingDropFramesMultiThreads) { cfg_.rc_buf_initial_sz = 500; @@ -594,7 +876,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting444) { ResetModel(); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(static_cast(cfg_.rc_target_bitrate), - effective_datarate_[0] * 0.85) + effective_datarate_[0] * 0.80) << " The datarate for the file exceeds the target by too much!"; ASSERT_LE(static_cast(cfg_.rc_target_bitrate), effective_datarate_[0] * 1.15) @@ -627,26 +909,29 @@ TEST_P(DatarateTestVP9Large, ChangingDropFrameThresh) { 30, 1, 0, 140); const int kDropFrameThreshTestStep = 30; - vpx_codec_pts_t last_drop = 140; - int last_num_drops = 0; - for (int i = 10; i < 100; i += kDropFrameThreshTestStep) { - cfg_.rc_dropframe_thresh = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) - << " The datarate for the file is greater than target by too much!"; - ASSERT_LE(first_drop_, last_drop) - << " The first dropped frame for drop_thresh " << i - << " > first dropped frame for drop_thresh " - << i - kDropFrameThreshTestStep; - ASSERT_GE(num_drops_, last_num_drops * 0.85) - << " The number of dropped frames for drop_thresh " << i - << " < number of dropped frames for drop_thresh " - << i - kDropFrameThreshTestStep; - last_drop = first_drop_; - last_num_drops = num_drops_; + for (int j = 50; j <= 150; j += 100) { + cfg_.rc_target_bitrate = j; + vpx_codec_pts_t last_drop = 140; + int last_num_drops = 0; + for (int i = 10; i < 100; i += kDropFrameThreshTestStep) { + cfg_.rc_dropframe_thresh = i; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25) + << " The datarate for the file is greater than target by too much!"; + ASSERT_LE(first_drop_, last_drop) + << " The first dropped frame for drop_thresh " << i + << " > first dropped frame for drop_thresh " + << i - kDropFrameThreshTestStep; + ASSERT_GE(num_drops_, last_num_drops * 0.85) + << " The number of dropped frames for drop_thresh " << i + << " < number of dropped frames for drop_thresh " + << i - kDropFrameThreshTestStep; + last_drop = first_drop_; + last_num_drops = num_drops_; + } } } @@ -789,8 +1074,13 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayersFrameDropping) { } #if CONFIG_VP9_TEMPORAL_DENOISING +class DatarateTestVP9LargeDenoiser : public DatarateTestVP9Large { + public: + virtual ~DatarateTestVP9LargeDenoiser() {} +}; + // Check basic datarate targeting, for a single bitrate, when denoiser is on. -TEST_P(DatarateTestVP9Large, DenoiserLevels) { +TEST_P(DatarateTestVP9LargeDenoiser, LowNoise) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -817,9 +1107,67 @@ TEST_P(DatarateTestVP9Large, DenoiserLevels) { << " The datarate for the file is greater than target by too much!"; } +// Check basic datarate targeting, for a single bitrate, when denoiser is on, +// for clip with high noise level. Use 2 threads. 
+TEST_P(DatarateTestVP9LargeDenoiser, HighNoise) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_threads = 2; + + ::libvpx_test::Y4mVideoSource video("noisy_clip_640_360.y4m", 0, 200); + + // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), + // there is only one denoiser mode: kDenoiserOnYOnly(which is 1), + // but may add more modes in the future. + cfg_.rc_target_bitrate = 1000; + ResetModel(); + // Turn on the denoiser. + denoiser_on_ = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic datarate targeting, for a single bitrate, when denoiser is on, +// for 1280x720 clip with 4 threads. +TEST_P(DatarateTestVP9LargeDenoiser, 4threads) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_threads = 4; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); + + // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), + // there is only one denoiser mode: denoiserYonly(which is 1), + // but may add more modes in the future. + cfg_.rc_target_bitrate = 1000; + ResetModel(); + // Turn on the denoiser. + denoiser_on_ = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.29) + << " The datarate for the file is greater than target by too much!"; +} + // Check basic datarate targeting, for a single bitrate, when denoiser is off // and on. 
-TEST_P(DatarateTestVP9Large, DenoiserOffOn) { +TEST_P(DatarateTestVP9LargeDenoiser, DenoiserOffOn) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -868,13 +1216,17 @@ class DatarateOnePassCbrSvc } virtual void ResetModel() { last_pts_ = 0; - bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz; - frame_number_ = 0; - first_drop_ = 0; - bits_total_ = 0; duration_ = 0.0; mismatch_psnr_ = 0.0; mismatch_nframes_ = 0; + denoiser_on_ = 0; + tune_content_ = 0; + base_speed_setting_ = 5; + spatial_layer_id_ = 0; + temporal_layer_id_ = 0; + memset(bits_in_buffer_model_, 0, sizeof(bits_in_buffer_model_)); + memset(bits_total_, 0, sizeof(bits_total_)); + memset(layer_target_avg_bandwidth_, 0, sizeof(layer_target_avg_bandwidth_)); } virtual void BeginPassHook(unsigned int /*pass*/) {} virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, @@ -885,45 +1237,114 @@ class DatarateOnePassCbrSvc svc_params_.max_quantizers[i] = 63; svc_params_.min_quantizers[i] = 0; } - svc_params_.speed_per_layer[0] = 5; + svc_params_.speed_per_layer[0] = base_speed_setting_; for (i = 1; i < VPX_SS_MAX_LAYERS; ++i) { - svc_params_.speed_per_layer[i] = 7; + svc_params_.speed_per_layer[i] = speed_setting_; } + + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); encoder->Control(VP9E_SET_SVC, 1); encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); encoder->Control(VP8E_SET_CPUUSED, speed_setting_); encoder->Control(VP9E_SET_TILE_COLUMNS, 0); encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 300); encoder->Control(VP9E_SET_TILE_COLUMNS, (cfg_.g_threads >> 1)); + encoder->Control(VP9E_SET_ROW_MT, 1); + encoder->Control(VP8E_SET_STATIC_THRESHOLD, 1); + encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_); } const vpx_rational_t tb = video->timebase(); timebase_ = static_cast(tb.num) / tb.den; duration_ = 0; } + + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + vpx_svc_layer_id_t layer_id; + encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id); + spatial_layer_id_ = layer_id.spatial_layer_id; + temporal_layer_id_ = layer_id.temporal_layer_id; + // Update buffer with per-layer target frame bandwidth, this is done + // for every frame passed to the encoder (encoded or dropped). + // For temporal layers, update the cumulative buffer level. + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + bits_in_buffer_model_[layer] += + static_cast(layer_target_avg_bandwidth_[layer]); + } + } + } + + vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, + uint32_t sizes[8], int *count) { + uint8_t marker; + marker = *(data + data_sz - 1); + *count = 0; + if ((marker & 0xe0) == 0xc0) { + const uint32_t frames = (marker & 0x7) + 1; + const uint32_t mag = ((marker >> 3) & 0x3) + 1; + const size_t index_sz = 2 + mag * frames; + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. + if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME; + { + const uint8_t marker2 = *(data + data_sz - index_sz); + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index therefore it's an + // invalid chunk. 
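+        // For example, a marker byte of 0xca has its top three bits equal to
+        // 0xc0 and encodes (0xca & 0x7) + 1 = 3 frames with
+        // ((0xca >> 3) & 0x3) + 1 = 2 size bytes each, so index_sz is
+        // 2 + 2 * 3 = 8: one marker byte at each end plus 2 bytes per frame.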
+ if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME; + } + { + uint32_t i, j; + const uint8_t *x = &data[data_sz - index_sz + 1]; + for (i = 0; i < frames; ++i) { + uint32_t this_sz = 0; + + for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8); + sizes[i] = this_sz; + } + *count = frames; + } + } + return VPX_CODEC_OK; + } + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; - if (last_pts_ == 0) duration = 1; - bits_in_buffer_model_ += static_cast( - duration * timebase_ * cfg_.rc_target_bitrate * 1000); + uint32_t sizes[8] = { 0 }; + int count = 0; + last_pts_ = pkt->data.frame.pts; const bool key_frame = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; - if (!key_frame) { - ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame " - << pkt->data.frame.pts; + parse_superframe_index(static_cast(pkt->data.frame.buf), + pkt->data.frame.sz, sizes, &count); + ASSERT_EQ(count, number_spatial_layers_); + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + sizes[sl] = sizes[sl] << 3; + // Update the total encoded bits per layer. + // For temporal layers, update the cumulative encoded bits per layer. + for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + bits_total_[layer] += static_cast(sizes[sl]); + // Update the per-layer buffer level with the encoded frame size. + bits_in_buffer_model_[layer] -= static_cast(sizes[sl]); + // There should be no buffer underrun, except on the base + // temporal layer, since there may be key frames there. + if (!key_frame && tl > 0) { + ASSERT_GE(bits_in_buffer_model_[layer], 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; + } + } } - const size_t frame_size_in_bits = pkt->data.frame.sz * 8; - bits_in_buffer_model_ -= frame_size_in_bits; - bits_total_ += frame_size_in_bits; - if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1; - last_pts_ = pkt->data.frame.pts; - bits_in_last_frame_ = frame_size_in_bits; - ++frame_number_; } + virtual void EndPassHook(void) { - if (bits_total_) { - const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit - duration_ = (last_pts_ + 1) * timebase_; - file_datarate_ = file_size_in_kb / duration_; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = 0; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + const double file_size_in_kb = bits_total_[layer] / 1000.; + duration_ = (last_pts_ + 1) * timebase_; + file_datarate_[layer] = file_size_in_kb / duration_; + } } } @@ -936,26 +1357,35 @@ class DatarateOnePassCbrSvc unsigned int GetMismatchFrames() { return mismatch_nframes_; } vpx_codec_pts_t last_pts_; - int64_t bits_in_buffer_model_; + int64_t bits_in_buffer_model_[VPX_MAX_LAYERS]; double timebase_; - int frame_number_; - vpx_codec_pts_t first_drop_; - int64_t bits_total_; + int64_t bits_total_[VPX_MAX_LAYERS]; double duration_; - double file_datarate_; + double file_datarate_[VPX_MAX_LAYERS]; size_t bits_in_last_frame_; vpx_svc_extra_cfg_t svc_params_; int speed_setting_; double mismatch_psnr_; int mismatch_nframes_; + int denoiser_on_; + int tune_content_; + int base_speed_setting_; + int spatial_layer_id_; + int temporal_layer_id_; + int number_spatial_layers_; + int number_temporal_layers_; + int layer_target_avg_bandwidth_[VPX_MAX_LAYERS]; }; static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg, const vpx_svc_extra_cfg_t *svc_params, int 
spatial_layers, int temporal_layers, - int temporal_layering_mode) { + int temporal_layering_mode, + int *layer_target_avg_bandwidth, + int64_t *bits_in_buffer_model) { int sl, spatial_layer_target; float total = 0; float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; + float framerate = 30.0; for (sl = 0; sl < spatial_layers; ++sl) { if (svc_params->scaling_factor_den[sl] > 0) { alloc_ratio[sl] = (float)(svc_params->scaling_factor_num[sl] * 1.0 / @@ -975,13 +1405,47 @@ static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg, } else if (temporal_layering_mode == 2) { enc_cfg->layer_target_bitrate[index] = spatial_layer_target * 2 / 3; enc_cfg->layer_target_bitrate[index + 1] = spatial_layer_target; + } else if (temporal_layering_mode <= 1) { + enc_cfg->layer_target_bitrate[index] = spatial_layer_target; + } + } + for (sl = 0; sl < spatial_layers; ++sl) { + for (int tl = 0; tl < temporal_layers; ++tl) { + const int layer = sl * temporal_layers + tl; + float layer_framerate = framerate; + if (temporal_layers == 2 && tl == 0) layer_framerate = framerate / 2; + if (temporal_layers == 3 && tl == 0) layer_framerate = framerate / 4; + if (temporal_layers == 3 && tl == 1) layer_framerate = framerate / 2; + layer_target_avg_bandwidth[layer] = static_cast( + enc_cfg->layer_target_bitrate[layer] * 1000.0 / layer_framerate); + bits_in_buffer_model[layer] = + enc_cfg->layer_target_bitrate[layer] * enc_cfg->rc_buf_initial_sz; } } } -// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and -// 3 temporal layers. Run CIF clip with 1 thread. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers) { +static void CheckLayerRateTargeting(vpx_codec_enc_cfg_t *const cfg, + int number_spatial_layers, + int number_temporal_layers, + double *file_datarate, + double thresh_overshoot, + double thresh_undershoot) { + for (int sl = 0; sl < number_spatial_layers; ++sl) + for (int tl = 0; tl < number_temporal_layers; ++tl) { + const int layer = sl * number_temporal_layers + tl; + ASSERT_GE(cfg->layer_target_bitrate[layer], + file_datarate[layer] * thresh_overshoot) + << " The datarate for the file exceeds the target by too much!"; + ASSERT_LE(cfg->layer_target_bitrate[layer], + file_datarate[layer] * thresh_undershoot) + << " The datarate for the file is lower than the target by too much!"; + } +} + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1 +// temporal layer, with screen content mode on and same speed setting for all +// layers. 
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TLScreenContent1) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -990,40 +1454,142 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers) { cfg_.rc_end_usage = VPX_CBR; cfg_.g_lag_in_frames = 0; cfg_.ss_number_layers = 2; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; + cfg_.ts_number_layers = 1; + cfg_.ts_rate_decimator[0] = 1; cfg_.g_error_resilient = 1; cfg_.g_threads = 1; - cfg_.temporal_layering_mode = 3; + cfg_.temporal_layering_mode = 0; svc_params_.scaling_factor_num[0] = 144; svc_params_.scaling_factor_den[0] = 288; svc_params_.scaling_factor_num[1] = 288; svc_params_.scaling_factor_den[1] = 288; cfg_.rc_dropframe_thresh = 10; cfg_.kf_max_dist = 9999; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); - // TODO(wonkap/marpan): Check that effective_datarate for each layer hits the + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 500; + ResetModel(); + tune_content_ = 1; + base_speed_setting_ = speed_setting_; + assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); + EXPECT_EQ(static_cast(0), GetMismatchFrames()); +} + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and +// 3 temporal layers. Run CIF clip with 1 thread. +TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.ss_number_layers = 2; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 3; + svc_params_.scaling_factor_num[0] = 144; + svc_params_.scaling_factor_den[0] = 288; + svc_params_.scaling_factor_num[1] = 288; + svc_params_.scaling_factor_den[1] = 288; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + // TODO(marpan): Check that effective_datarate for each layer hits the // layer target_bitrate. 
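+  // Illustrative numbers for the buffer model that assign_layer_bitrates()
+  // sets up with 3 temporal layers on this 30 fps clip: TL0 is treated as a
+  // 7.5 fps stream and TL1 as 15 fps, so a 100 kbps TL0 target translates to
+  // 100 * 1000 / 7.5 = 13333 bits of budget per TL0 frame, and that layer's
+  // buffer starts at 100 * rc_buf_initial_sz = 50000 bits.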
   for (int i = 200; i <= 800; i += 200) {
     cfg_.rc_target_bitrate = i;
     ResetModel();
     assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                          cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+                          cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+                          layer_target_avg_bandwidth_, bits_in_buffer_model_);
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
-        << " The datarate for the file exceeds the target by too much!";
-    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
-        << " The datarate for the file is lower than the target by too much!";
-    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+    CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+                            number_temporal_layers_, file_datarate_, 0.78,
+                            1.15);
+#if CONFIG_VP9_DECODER
+    // Number of temporal layers > 1, so half of the frames in this SVC pattern
+    // will be non-reference frame and hence encoder will avoid loopfilter.
+    // Since frame dropper is off, we can expect 200 (half of the sequence)
+    // mismatched frames.
+    EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
+#endif
+  }
+}
+
+// Check basic rate targeting for 1 pass CBR SVC with denoising.
+// 2 spatial layers and 3 temporal layer. Run HD clip with 2 threads.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLDenoiserOn) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 2;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 2;
+  cfg_.temporal_layering_mode = 3;
+  svc_params_.scaling_factor_num[0] = 144;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 288;
+  svc_params_.scaling_factor_den[1] = 288;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.kf_max_dist = 9999;
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  // TODO(marpan): Check that effective_datarate for each layer hits the
+  // layer target_bitrate.
+  // For SVC, noise_sen = 1 means denoising only the top spatial layer
+  // noise_sen = 2 means denoising the two top spatial layers.
+  for (int noise_sen = 1; noise_sen <= 2; noise_sen++) {
+    for (int i = 600; i <= 1000; i += 200) {
+      cfg_.rc_target_bitrate = i;
+      ResetModel();
+      denoiser_on_ = noise_sen;
+      assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+                            cfg_.ts_number_layers, cfg_.temporal_layering_mode,
+                            layer_target_avg_bandwidth_, bits_in_buffer_model_);
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      CheckLayerRateTargeting(&cfg_, number_spatial_layers_,
+                              number_temporal_layers_, file_datarate_, 0.78,
+                              1.15);
+#if CONFIG_VP9_DECODER
+      // Number of temporal layers > 1, so half of the frames in this SVC
+      // pattern
+      // will be non-reference frame and hence encoder will avoid loopfilter.
+      // Since frame dropper is off, we can expect 200 (half of the sequence)
+      // mismatched frames.
+      EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames());
+#endif
+    }
   }
 }

 // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3
 // temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayersSmallKf) { +TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLSmallKf) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -1044,28 +1610,29 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayersSmallKf) { svc_params_.scaling_factor_num[1] = 288; svc_params_.scaling_factor_den[1] = 288; cfg_.rc_dropframe_thresh = 10; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); cfg_.rc_target_bitrate = 400; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); // For this 3 temporal layer case, pattern repeats every 4 frames, so choose // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). for (int j = 64; j <= 67; j++) { cfg_.kf_max_dist = j; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15) - << " The datarate for the file is lower than the target by too much!"; - EXPECT_EQ(static_cast(0), GetMismatchFrames()); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, + 1.15); } } // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and // 3 temporal layers. Run HD clip with 4 threads. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers4threads) { +TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4Threads) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -1085,25 +1652,31 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers4threads) { svc_params_.scaling_factor_den[0] = 288; svc_params_.scaling_factor_num[1] = 288; svc_params_.scaling_factor_den[1] = 288; - cfg_.rc_dropframe_thresh = 10; + cfg_.rc_dropframe_thresh = 0; cfg_.kf_max_dist = 9999; - ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720, 30, - 1, 0, 300); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 800; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15) - << " The datarate for the file is lower than the target by too much!"; - EXPECT_EQ(static_cast(0), GetMismatchFrames()); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); +#if CONFIG_VP9_DECODER + // Number of temporal layers > 1, so half of the frames in this SVC pattern + // will be non-reference frame and hence encoder will avoid loopfilter. 
+ // Since frame dropper is off, we can expect 30 (half of the sequence) + // mismatched frames. + EXPECT_EQ(static_cast(30), GetMismatchFrames()); +#endif } // Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and // 3 temporal layers. Run CIF clip with 1 thread. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers) { +TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -1125,25 +1698,32 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers) { svc_params_.scaling_factor_den[1] = 288; svc_params_.scaling_factor_num[2] = 288; svc_params_.scaling_factor_den[2] = 288; - cfg_.rc_dropframe_thresh = 10; + cfg_.rc_dropframe_thresh = 0; cfg_.kf_max_dist = 9999; - ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720, 30, - 1, 0, 300); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); cfg_.rc_target_bitrate = 800; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22) - << " The datarate for the file is lower than the target by too much!"; - EXPECT_EQ(static_cast(0), GetMismatchFrames()); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); +#if CONFIG_VP9_DECODER + // Number of temporal layers > 1, so half of the frames in this SVC pattern + // will be non-reference frame and hence encoder will avoid loopfilter. + // Since frame dropper is off, we can expect 200 (half of the sequence) + // mismatched frames. + EXPECT_EQ(static_cast(200), GetMismatchFrames()); +#endif } // Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 // temporal layers. Run CIF clip with 1 thread, and few short key frame periods. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayersSmallKf) { +TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TLSmallKf) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -1166,28 +1746,29 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayersSmallKf) { svc_params_.scaling_factor_num[2] = 288; svc_params_.scaling_factor_den[2] = 288; cfg_.rc_dropframe_thresh = 10; - ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720, 30, - 1, 0, 300); cfg_.rc_target_bitrate = 800; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); // For this 3 temporal layer case, pattern repeats every 4 frames, so choose // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). 
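+  // e.g. with the 4-frame pattern (temporal layers 0-2-1-2), the first forced
+  // key frame lands on frame 32 (layer 0), 33 (layer 2), 34 (layer 1) or
+  // 35 (layer 2) for the respective kf_max_dist values swept below.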
for (int j = 32; j <= 35; j++) { cfg_.kf_max_dist = j; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.30) - << " The datarate for the file is lower than the target by too much!"; - EXPECT_EQ(static_cast(0), GetMismatchFrames()); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, + 1.15); } } // Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and // 3 temporal layers. Run HD clip with 4 threads. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers4threads) { +TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL4threads) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -1209,27 +1790,86 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers4threads) { svc_params_.scaling_factor_den[1] = 288; svc_params_.scaling_factor_num[2] = 288; svc_params_.scaling_factor_den[2] = 288; - cfg_.rc_dropframe_thresh = 10; + cfg_.rc_dropframe_thresh = 0; cfg_.kf_max_dist = 9999; - ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720, 30, - 1, 0, 300); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 800; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22) - << " The datarate for the file is lower than the target by too much!"; + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); +#if CONFIG_VP9_DECODER + // Number of temporal layers > 1, so half of the frames in this SVC pattern + // will be non-reference frame and hence encoder will avoid loopfilter. + // Since frame dropper is off, we can expect 30 (half of the sequence) + // mismatched frames. + EXPECT_EQ(static_cast(30), GetMismatchFrames()); +#endif +} + +// Run SVC encoder for 1 temporal layer, 2 spatial layers, with spatial +// downscale 5x5. 
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TL5x5MultipleRuns) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.ss_number_layers = 2; + cfg_.ts_number_layers = 1; + cfg_.ts_rate_decimator[0] = 1; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 3; + cfg_.temporal_layering_mode = 0; + svc_params_.scaling_factor_num[0] = 256; + svc_params_.scaling_factor_den[0] = 1280; + svc_params_.scaling_factor_num[1] = 1280; + svc_params_.scaling_factor_den[1] = 1280; + cfg_.rc_dropframe_thresh = 10; + cfg_.kf_max_dist = 999999; + cfg_.kf_min_dist = 0; + cfg_.ss_target_bitrate[0] = 300; + cfg_.ss_target_bitrate[1] = 1400; + cfg_.layer_target_bitrate[0] = 300; + cfg_.layer_target_bitrate[1] = 1400; + cfg_.rc_target_bitrate = 1700; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ResetModel(); + layer_target_avg_bandwidth_[0] = cfg_.layer_target_bitrate[0] * 1000 / 30; + bits_in_buffer_model_[0] = + cfg_.layer_target_bitrate[0] * cfg_.rc_buf_initial_sz; + layer_target_avg_bandwidth_[1] = cfg_.layer_target_bitrate[1] * 1000 / 30; + bits_in_buffer_model_[1] = + cfg_.layer_target_bitrate[1] * cfg_.rc_buf_initial_sz; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); EXPECT_EQ(static_cast(0), GetMismatchFrames()); } -VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES); +VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES, + ::testing::Values(0)); +VP8_INSTANTIATE_TEST_CASE(DatarateTestRealTime, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Values(-6, -12)); VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large, ::testing::Values(::libvpx_test::kOnePassGood, ::libvpx_test::kRealTime), ::testing::Range(2, 9)); +#if CONFIG_VP9_TEMPORAL_DENOISING +VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeDenoiser, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Range(5, 9)); +#endif VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvc, ::testing::Values(::libvpx_test::kRealTime), ::testing::Range(5, 9)); diff --git a/libs/libvpx/test/dct16x16_test.cc b/libs/libvpx/test/dct16x16_test.cc index f9745ed81f..ce0bd37b3d 100644 --- a/libs/libvpx/test/dct16x16_test.cc +++ b/libs/libvpx/test/dct16x16_test.cc @@ -255,11 +255,11 @@ void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride, #if CONFIG_VP9_HIGHBITDEPTH void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_c(in, out, stride, 10); + vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_c(in, out, stride, 12); + vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride, @@ -273,36 +273,36 @@ void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride, } void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 10); + vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10); } void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht16x16_256_add_c(in, 
out, stride, tx_type, 12); + vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12); } #if HAVE_SSE2 void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_c(in, out, stride, 10); + vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_c(in, out, stride, 12); + vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 10); + vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 12); + vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 10); + vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 12); + vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } #endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH @@ -353,7 +353,7 @@ class Trans16x16TestBase { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -475,10 +475,10 @@ class Trans16x16TestBase { ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_)); #if CONFIG_VP9_HIGHBITDEPTH } else { - inv_txfm_ref(output_ref_block, CONVERT_TO_BYTEPTR(ref16), pitch_, + inv_txfm_ref(output_ref_block, CAST_TO_BYTEPTR(ref16), pitch_, tx_type_); ASM_REGISTER_STATE_CHECK( - RunInvTxfm(output_ref_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(output_ref_block, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } if (bit_depth_ == VPX_BITS_8) { @@ -530,8 +530,7 @@ class Trans16x16TestBase { ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16)); #if CONFIG_VP9_HIGHBITDEPTH } else { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), 16)); + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), 16)); #endif // CONFIG_VP9_HIGHBITDEPTH } @@ -543,8 +542,8 @@ class Trans16x16TestBase { const uint32_t diff = dst[j] - src[j]; #endif // CONFIG_VP9_HIGHBITDEPTH const uint32_t error = diff * diff; - EXPECT_GE(1u, error) << "Error: 16x16 IDCT has error " << error - << " at index " << j; + EXPECT_GE(1u, error) + << "Error: 16x16 IDCT has error " << error << " at index " << j; } } } @@ -585,9 +584,9 @@ class Trans16x16TestBase { ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_)); } else { #if CONFIG_VP9_HIGHBITDEPTH - ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_); + ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); #endif // CONFIG_VP9_HIGHBITDEPTH } @@ -745,66 +744,6 @@ TEST_P(InvTrans16x16DCT, CompareReference) { CompareInvReference(ref_txfm_, thresh_); } -class PartialTrans16x16Test : public ::testing::TestWithParam< - std::tr1::tuple > { - public: - virtual ~PartialTrans16x16Test() {} - virtual void 
SetUp() { - fwd_txfm_ = GET_PARAM(0); - bit_depth_ = GET_PARAM(1); - } - - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - vpx_bit_depth_t bit_depth_; - FdctFunc fwd_txfm_; -}; - -TEST_P(PartialTrans16x16Test, Extremes) { -#if CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - const int minval = -maxval; - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - - for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16)); - EXPECT_EQ((maxval * kNumCoeffs) >> 1, output[0]); - - for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16)); - EXPECT_EQ((minval * kNumCoeffs) >> 1, output[0]); -} - -TEST_P(PartialTrans16x16Test, Random) { -#if CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - ACMRandom rnd(ACMRandom::DeterministicSeed()); - - int sum = 0; - for (int i = 0; i < kNumCoeffs; ++i) { - const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1); - input[i] = val; - sum += val; - } - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16)); - EXPECT_EQ(sum >> 1, output[0]); -} - using std::tr1::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH @@ -837,11 +776,6 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - C, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_8), - make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_12))); #else INSTANTIATE_TEST_CASE_P( C, Trans16x16HT, @@ -850,17 +784,14 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(C, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_c, - VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH -#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( NEON, Trans16x16DCT, - ::testing::Values(make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_neon, - 0, VPX_BITS_8))); -#endif + ::testing::Values(make_tuple(&vpx_fdct16x16_neon, + &vpx_idct16x16_256_add_neon, 0, VPX_BITS_8))); +#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( @@ -877,9 +808,6 @@ INSTANTIATE_TEST_CASE_P( 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2, - VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -914,9 +842,6 @@ INSTANTIATE_TEST_CASE_P( &idct16x16_10_add_12_sse2, 3167, 
VPX_BITS_12), make_tuple(&idct16x16_12, &idct16x16_256_add_12_sse2, 3167, VPX_BITS_12))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2, - VPX_BITS_8))); #endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -932,8 +857,12 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(MSA, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_msa, - VPX_BITS_8))); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P(VSX, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_c, + &vpx_idct16x16_256_add_vsx, + 0, VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libs/libvpx/test/dct32x32_test.cc b/libs/libvpx/test/dct32x32_test.cc index a168e690ee..a95ff97328 100644 --- a/libs/libvpx/test/dct32x32_test.cc +++ b/libs/libvpx/test/dct32x32_test.cc @@ -71,11 +71,11 @@ typedef std::tr1::tuple #if CONFIG_VP9_HIGHBITDEPTH void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct32x32_1024_add_c(in, out, stride, 10); + vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct32x32_1024_add_c(in, out, stride, 12); + vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -137,7 +137,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - inv_txfm_(test_temp_block, CONVERT_TO_BYTEPTR(dst16), 32)); + inv_txfm_(test_temp_block, CAST_TO_BYTEPTR(dst16), 32)); #endif } @@ -275,7 +275,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) { ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32)); #if CONFIG_VP9_HIGHBITDEPTH } else { - ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CONVERT_TO_BYTEPTR(dst16), 32)); + ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CAST_TO_BYTEPTR(dst16), 32)); #endif } for (int j = 0; j < kNumCoeffs; ++j) { @@ -292,67 +292,6 @@ TEST_P(Trans32x32Test, InverseAccuracy) { } } -class PartialTrans32x32Test - : public ::testing::TestWithParam< - std::tr1::tuple > { - public: - virtual ~PartialTrans32x32Test() {} - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - bit_depth_ = GET_PARAM(1); - } - - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - vpx_bit_depth_t bit_depth_; - FwdTxfmFunc fwd_txfm_; -}; - -TEST_P(PartialTrans32x32Test, Extremes) { -#if CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - const int minval = -maxval; - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - - for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32)); - EXPECT_EQ((maxval * kNumCoeffs) >> 3, output[0]); - - for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32)); - EXPECT_EQ((minval * kNumCoeffs) >> 3, output[0]); -} - -TEST_P(PartialTrans32x32Test, Random) { -#if 
CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - ACMRandom rnd(ACMRandom::DeterministicSeed()); - - int sum = 0; - for (int i = 0; i < kNumCoeffs; ++i) { - const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1); - input[i] = val; - sum += val; - } - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32)); - EXPECT_EQ(sum >> 3, output[0]); -} - using std::tr1::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH @@ -366,11 +305,6 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - C, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_8), - make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_12))); #else INSTANTIATE_TEST_CASE_P( C, Trans32x32Test, @@ -378,19 +312,16 @@ INSTANTIATE_TEST_CASE_P( VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(C, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_c, - VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH -#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( NEON, Trans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_neon, - 0, VPX_BITS_8), - make_tuple(&vpx_fdct32x32_rd_c, + ::testing::Values(make_tuple(&vpx_fdct32x32_neon, + &vpx_idct32x32_1024_add_neon, 0, VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_neon, &vpx_idct32x32_1024_add_neon, 1, VPX_BITS_8))); -#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( @@ -399,9 +330,6 @@ INSTANTIATE_TEST_CASE_P( &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, - VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -418,9 +346,6 @@ INSTANTIATE_TEST_CASE_P( VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, - VPX_BITS_8))); #endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -439,8 +364,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_idct32x32_1024_add_msa, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_msa, &vpx_idct32x32_1024_add_msa, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(MSA, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa, - VPX_BITS_8))); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P( + VSX, Trans32x32Test, + ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_vsx, + 0, VPX_BITS_8), + 
make_tuple(&vpx_fdct32x32_rd_c, + &vpx_idct32x32_1024_add_vsx, 1, VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libs/libvpx/test/dct_partial_test.cc b/libs/libvpx/test/dct_partial_test.cc new file mode 100644 index 0000000000..4d145f5891 --- /dev/null +++ b/libs/libvpx/test/dct_partial_test.cc @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/buffer.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +using libvpx_test::ACMRandom; +using libvpx_test::Buffer; +using std::tr1::tuple; +using std::tr1::make_tuple; + +namespace { +typedef void (*PartialFdctFunc)(const int16_t *in, tran_low_t *out, int stride); + +typedef tuple + PartialFdctParam; + +tran_low_t partial_fdct_ref(const Buffer &in, int size) { + int64_t sum = 0; + for (int y = 0; y < size; ++y) { + for (int x = 0; x < size; ++x) { + sum += in.TopLeftPixel()[y * in.stride() + x]; + } + } + + switch (size) { + case 4: sum *= 2; break; + case 8: /*sum = sum;*/ break; + case 16: sum >>= 1; break; + case 32: sum >>= 3; break; + } + + return static_cast(sum); +} + +class PartialFdctTest : public ::testing::TestWithParam { + public: + PartialFdctTest() { + fwd_txfm_ = GET_PARAM(0); + size_ = GET_PARAM(1); + bit_depth_ = GET_PARAM(2); + } + + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + void RunTest() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int16_t maxvalue = + clip_pixel_highbd(std::numeric_limits::max(), bit_depth_); + const int16_t minvalue = -maxvalue; + Buffer input_block = + Buffer(size_, size_, 8, size_ == 4 ? 
0 : 16); + ASSERT_TRUE(input_block.Init()); + Buffer output_block = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(output_block.Init()); + + for (int i = 0; i < 100; ++i) { + if (i == 0) { + input_block.Set(maxvalue); + } else if (i == 1) { + input_block.Set(minvalue); + } else { + input_block.Set(&rnd, minvalue, maxvalue); + } + + ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block.TopLeftPixel(), + output_block.TopLeftPixel(), + input_block.stride())); + + EXPECT_EQ(partial_fdct_ref(input_block, size_), + output_block.TopLeftPixel()[0]); + } + } + + PartialFdctFunc fwd_txfm_; + vpx_bit_depth_t bit_depth_; + int size_; +}; + +TEST_P(PartialFdctTest, PartialFdctTest) { RunTest(); } + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_10), + make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_1_c, 16, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct16x16_1_c, 16, VPX_BITS_10), + make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct8x8_1_c, 8, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct8x8_1_c, 8, VPX_BITS_10), + make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + C, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_sse2, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_sse2, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_sse2, 4, VPX_BITS_8))); +#endif // HAVE_SSE2 + +#if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + NEON, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_12), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_10), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + NEON, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON + +#if HAVE_MSA +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P(MSA, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct8x8_1_msa, 8, + VPX_BITS_8))); +#else // !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + MSA, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_msa, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_msa, 8, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_MSA +} // namespace diff --git a/libs/libvpx/test/dct_test.cc b/libs/libvpx/test/dct_test.cc new file mode 100644 index 0000000000..addbdfb463 --- /dev/null +++ b/libs/libvpx/test/dct_test.cc @@ -0,0 +1,737 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/buffer.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vp9/common/vp9_entropy.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +using libvpx_test::ACMRandom; +using libvpx_test::Buffer; +using std::tr1::tuple; +using std::tr1::make_tuple; + +namespace { +typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); +typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, + int tx_type); +typedef void (*FhtFuncRef)(const Buffer &in, Buffer *out, + int size, int tx_type); +typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type); + +/* forward transform, inverse transform, size, transform type, bit depth */ +typedef tuple DctParam; +typedef tuple HtParam; + +void fdct_ref(const Buffer &in, Buffer *out, int size, + int /*tx_type*/) { + const int16_t *i = in.TopLeftPixel(); + const int i_stride = in.stride(); + tran_low_t *o = out->TopLeftPixel(); + if (size == 4) { + vpx_fdct4x4_c(i, o, i_stride); + } else if (size == 8) { + vpx_fdct8x8_c(i, o, i_stride); + } else if (size == 16) { + vpx_fdct16x16_c(i, o, i_stride); + } else if (size == 32) { + vpx_fdct32x32_c(i, o, i_stride); + } +} + +void fht_ref(const Buffer &in, Buffer *out, int size, + int tx_type) { + const int16_t *i = in.TopLeftPixel(); + const int i_stride = in.stride(); + tran_low_t *o = out->TopLeftPixel(); + if (size == 4) { + vp9_fht4x4_c(i, o, i_stride, tx_type); + } else if (size == 8) { + vp9_fht8x8_c(i, o, i_stride, tx_type); + } else if (size == 16) { + vp9_fht16x16_c(i, o, i_stride, tx_type); + } +} + +void fwht_ref(const Buffer &in, Buffer *out, int size, + int /*tx_type*/) { + ASSERT_EQ(size, 4); + vp9_fwht4x4_c(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); +} + +#if CONFIG_VP9_HIGHBITDEPTH +#define idctNxN(n, coeffs, bitdepth) \ + void idct##n##x##n##_##bitdepth(const tran_low_t *in, uint8_t *out, \ + int stride) { \ + vpx_highbd_idct##n##x##n##_##coeffs##_add_c(in, CAST_TO_SHORTPTR(out), \ + stride, bitdepth); \ + } + +idctNxN(4, 16, 10); +idctNxN(4, 16, 12); +idctNxN(8, 64, 10); +idctNxN(8, 64, 12); +idctNxN(16, 256, 10); +idctNxN(16, 256, 12); +idctNxN(32, 1024, 10); +idctNxN(32, 1024, 12); + +#define ihtNxN(n, coeffs, bitdepth) \ + void iht##n##x##n##_##bitdepth(const tran_low_t *in, uint8_t *out, \ + int stride, int tx_type) { \ + vp9_highbd_iht##n##x##n##_##coeffs##_add_c(in, CAST_TO_SHORTPTR(out), \ + stride, tx_type, bitdepth); \ + } + +ihtNxN(4, 16, 10); +ihtNxN(4, 16, 12); +ihtNxN(8, 64, 10); +ihtNxN(8, 64, 12); +ihtNxN(16, 256, 10); +// ihtNxN(16, 256, 12); + +void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); +} + +void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); +} 
+#endif // CONFIG_VP9_HIGHBITDEPTH + +class TransTestBase { + public: + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + virtual void RunFwdTxfm(const Buffer &in, + Buffer *out) = 0; + + virtual void RunInvTxfm(const Buffer &in, uint8_t *out) = 0; + + void RunAccuracyCheck(int limit) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + Buffer test_input_block = + Buffer(size_, size_, 8, size_ == 4 ? 0 : 16); + ASSERT_TRUE(test_input_block.Init()); + Buffer test_temp_block = + Buffer(size_, size_, 0, 16); + ASSERT_TRUE(test_temp_block.Init()); + Buffer dst = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(dst.Init()); + Buffer src = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(src.Init()); +#if CONFIG_VP9_HIGHBITDEPTH + Buffer dst16 = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(dst16.Init()); + Buffer src16 = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(src16.Init()); +#endif // CONFIG_VP9_HIGHBITDEPTH + uint32_t max_error = 0; + int64_t total_error = 0; + const int count_test_block = 10000; + for (int i = 0; i < count_test_block; ++i) { + if (bit_depth_ == 8) { + src.Set(&rnd, &ACMRandom::Rand8); + dst.Set(&rnd, &ACMRandom::Rand8); + // Initialize a test block with input range [-255, 255]. + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] = + src.TopLeftPixel()[h * src.stride() + w] - + dst.TopLeftPixel()[h * dst.stride() + w]; + } + } +#if CONFIG_VP9_HIGHBITDEPTH + } else { + src16.Set(&rnd, 0, max_pixel_value_); + dst16.Set(&rnd, 0, max_pixel_value_); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] = + src16.TopLeftPixel()[h * src16.stride() + w] - + dst16.TopLeftPixel()[h * dst16.stride() + w]; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block, &test_temp_block)); + if (bit_depth_ == VPX_BITS_8) { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(test_temp_block, dst.TopLeftPixel())); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16.TopLeftPixel()))); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + int diff; +#if CONFIG_VP9_HIGHBITDEPTH + if (bit_depth_ != 8) { + diff = dst16.TopLeftPixel()[h * dst16.stride() + w] - + src16.TopLeftPixel()[h * src16.stride() + w]; + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + diff = dst.TopLeftPixel()[h * dst.stride() + w] - + src.TopLeftPixel()[h * src.stride() + w]; +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + const uint32_t error = diff * diff; + if (max_error < error) max_error = error; + total_error += error; + } + } + } + + EXPECT_GE(static_cast(limit), max_error) + << "Error: 4x4 FHT/IHT has an individual round trip error > " << limit; + + EXPECT_GE(count_test_block * limit, total_error) + << "Error: 4x4 FHT/IHT has average round trip error > " << limit + << " per block"; + } + + void RunCoeffCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 5000; + Buffer input_block = + Buffer(size_, size_, 8, size_ == 4 ? 
0 : 16); + ASSERT_TRUE(input_block.Init()); + Buffer output_ref_block = Buffer(size_, size_, 0); + ASSERT_TRUE(output_ref_block.Init()); + Buffer output_block = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(output_block.Init()); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-max_pixel_value_, + // max_pixel_value_]. + input_block.Set(&rnd, -max_pixel_value_, max_pixel_value_); + + fwd_txfm_ref(input_block, &output_ref_block, size_, tx_type_); + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, &output_block)); + + // The minimum quant value is 4. + EXPECT_TRUE(output_block.CheckValues(output_ref_block)); + if (::testing::Test::HasFailure()) { + printf("Size: %d Transform type: %d\n", size_, tx_type_); + output_block.PrintDifference(output_ref_block); + return; + } + } + } + + void RunMemCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 5000; + Buffer input_extreme_block = + Buffer(size_, size_, 8, size_ == 4 ? 0 : 16); + ASSERT_TRUE(input_extreme_block.Init()); + Buffer output_ref_block = Buffer(size_, size_, 0); + ASSERT_TRUE(output_ref_block.Init()); + Buffer output_block = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(output_block.Init()); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with -max_pixel_value_ or max_pixel_value_. + if (i == 0) { + input_extreme_block.Set(max_pixel_value_); + } else if (i == 1) { + input_extreme_block.Set(-max_pixel_value_); + } else { + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + input_extreme_block + .TopLeftPixel()[h * input_extreme_block.stride() + w] = + rnd.Rand8() % 2 ? max_pixel_value_ : -max_pixel_value_; + } + } + } + + fwd_txfm_ref(input_extreme_block, &output_ref_block, size_, tx_type_); + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block, &output_block)); + + // The minimum quant value is 4. + EXPECT_TRUE(output_block.CheckValues(output_ref_block)); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + EXPECT_GE( + 4 * DCT_MAX_VALUE << (bit_depth_ - 8), + abs(output_block.TopLeftPixel()[h * output_block.stride() + w])) + << "Error: 4x4 FDCT has coefficient larger than " + "4*DCT_MAX_VALUE" + << " at " << w << "," << h; + if (::testing::Test::HasFailure()) { + printf("Size: %d Transform type: %d\n", size_, tx_type_); + output_block.DumpBuffer(); + return; + } + } + } + } + } + + void RunInvAccuracyCheck(int limit) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + Buffer in = Buffer(size_, size_, 4); + ASSERT_TRUE(in.Init()); + Buffer coeff = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(coeff.Init()); + Buffer dst = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(dst.Init()); + Buffer src = Buffer(size_, size_, 0); + ASSERT_TRUE(src.Init()); + Buffer dst16 = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(dst16.Init()); + Buffer src16 = Buffer(size_, size_, 0); + ASSERT_TRUE(src16.Init()); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-max_pixel_value_, + // max_pixel_value_]. 
+ if (bit_depth_ == VPX_BITS_8) { + src.Set(&rnd, &ACMRandom::Rand8); + dst.Set(&rnd, &ACMRandom::Rand8); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + in.TopLeftPixel()[h * in.stride() + w] = + src.TopLeftPixel()[h * src.stride() + w] - + dst.TopLeftPixel()[h * dst.stride() + w]; + } + } +#if CONFIG_VP9_HIGHBITDEPTH + } else { + src16.Set(&rnd, 0, max_pixel_value_); + dst16.Set(&rnd, 0, max_pixel_value_); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + in.TopLeftPixel()[h * in.stride() + w] = + src16.TopLeftPixel()[h * src16.stride() + w] - + dst16.TopLeftPixel()[h * dst16.stride() + w]; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + fwd_txfm_ref(in, &coeff, size_, tx_type_); + + if (bit_depth_ == VPX_BITS_8) { + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst.TopLeftPixel())); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16.TopLeftPixel()))); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + int diff; +#if CONFIG_VP9_HIGHBITDEPTH + if (bit_depth_ != 8) { + diff = dst16.TopLeftPixel()[h * dst16.stride() + w] - + src16.TopLeftPixel()[h * src16.stride() + w]; + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + diff = dst.TopLeftPixel()[h * dst.stride() + w] - + src.TopLeftPixel()[h * src.stride() + w]; +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + const uint32_t error = diff * diff; + EXPECT_GE(static_cast(limit), error) + << "Error: " << size_ << "x" << size_ << " IDCT has error " + << error << " at " << w << "," << h; + } + } + } + } + + FhtFuncRef fwd_txfm_ref; + vpx_bit_depth_t bit_depth_; + int tx_type_; + int max_pixel_value_; + int size_; +}; + +class TransDCT : public TransTestBase, + public ::testing::TestWithParam { + public: + TransDCT() { + fwd_txfm_ref = fdct_ref; + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + size_ = GET_PARAM(2); + tx_type_ = GET_PARAM(3); + bit_depth_ = GET_PARAM(4); + max_pixel_value_ = (1 << bit_depth_) - 1; + } + + protected: + void RunFwdTxfm(const Buffer &in, Buffer *out) { + fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); + } + + void RunInvTxfm(const Buffer &in, uint8_t *out) { + inv_txfm_(in.TopLeftPixel(), out, in.stride()); + } + + FdctFunc fwd_txfm_; + IdctFunc inv_txfm_; +}; + +TEST_P(TransDCT, AccuracyCheck) { RunAccuracyCheck(1); } + +TEST_P(TransDCT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(TransDCT, MemCheck) { RunMemCheck(); } + +TEST_P(TransDCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, TransDCT, + ::testing::Values( + make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_10, 32, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_12, 32, 0, VPX_BITS_10), + make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_10, 16, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_12, 16, 0, VPX_BITS_10), + make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_10, 8, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_12, 8, 0, VPX_BITS_10), + make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 8, 0, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 4, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 4, 0, VPX_BITS_12), + make_tuple(&vpx_fdct4x4_c, 
&vpx_idct4x4_16_add_c, 4, 0, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + C, TransDCT, + ::testing::Values( + make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 8, 0, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 4, 0, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +#if !CONFIG_EMULATE_HARDWARE +#if CONFIG_VP9_HIGHBITDEPTH +/* TODO:(johannkoenig) Determine why these fail AccuracyCheck + make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_12, 32, 0, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_12, 16, 0, VPX_BITS_12), +*/ +INSTANTIATE_TEST_CASE_P( + SSE2, TransDCT, + ::testing::Values( + make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_10, 32, 0, + VPX_BITS_10), + make_tuple(&vpx_fdct32x32_sse2, &vpx_idct32x32_1024_add_sse2, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_10, 16, 0, + VPX_BITS_10), + make_tuple(&vpx_fdct16x16_sse2, &vpx_idct16x16_256_add_sse2, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct8x8_sse2, &idct8x8_10, 8, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct8x8_sse2, &idct8x8_12, 8, 0, VPX_BITS_12), + make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_sse2, 8, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10, 4, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12, 4, 0, VPX_BITS_12), + make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_sse2, 4, 0, + VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + SSE2, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct32x32_sse2, + &vpx_idct32x32_1024_add_sse2, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct16x16_sse2, + &vpx_idct16x16_256_add_sse2, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_sse2, 8, + 0, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_sse2, 4, + 0, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // !CONFIG_EMULATE_HARDWARE +#endif // HAVE_SSE2 + +#if !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE +#if !ARCH_X86_64 +// TODO(johannkoenig): high bit depth fdct8x8. +INSTANTIATE_TEST_CASE_P( + SSSE3, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2, + 32, 0, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_sse2, 8, 0, + VPX_BITS_8))); +#else +// vpx_fdct8x8_ssse3 is only available in 64 bit builds. +INSTANTIATE_TEST_CASE_P( + SSSE3, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2, + 32, 0, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_ssse3, &vpx_idct8x8_64_add_sse2, + 8, 0, VPX_BITS_8))); +#endif // !ARCH_X86_64 +#endif // HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE +#endif // !CONFIG_VP9_HIGHBITDEPTH + +#if !CONFIG_VP9_HIGHBITDEPTH && HAVE_AVX2 && !CONFIG_EMULATE_HARDWARE +// TODO(johannkoenig): high bit depth fdct32x32. 
+INSTANTIATE_TEST_CASE_P( + AVX2, TransDCT, ::testing::Values(make_tuple(&vpx_fdct32x32_avx2, + &vpx_idct32x32_1024_add_sse2, + 32, 0, VPX_BITS_8))); + +#endif // !CONFIG_VP9_HIGHBITDEPTH && HAVE_AVX2 && !CONFIG_EMULATE_HARDWARE + +#if HAVE_NEON +#if !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P( + NEON, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct32x32_neon, + &vpx_idct32x32_1024_add_neon, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct16x16_neon, + &vpx_idct16x16_256_add_neon, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct8x8_neon, &vpx_idct8x8_64_add_neon, 8, + 0, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_neon, &vpx_idct4x4_16_add_neon, 4, + 0, VPX_BITS_8))); +#endif // !CONFIG_EMULATE_HARDWARE +#endif // HAVE_NEON + +#if HAVE_MSA +#if !CONFIG_VP9_HIGHBITDEPTH +#if !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P( + MSA, TransDCT, + ::testing::Values( + make_tuple(&vpx_fdct32x32_msa, &vpx_idct32x32_1024_add_msa, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct16x16_msa, &vpx_idct16x16_256_add_msa, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct8x8_msa, &vpx_idct8x8_64_add_msa, 8, 0, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_msa, &vpx_idct4x4_16_add_msa, 4, 0, + VPX_BITS_8))); +#endif // !CONFIG_EMULATE_HARDWARE +#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_MSA + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P(VSX, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct4x4_c, + &vpx_idct4x4_16_add_vsx, 4, + 0, VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +class TransHT : public TransTestBase, public ::testing::TestWithParam { + public: + TransHT() { + fwd_txfm_ref = fht_ref; + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + size_ = GET_PARAM(2); + tx_type_ = GET_PARAM(3); + bit_depth_ = GET_PARAM(4); + max_pixel_value_ = (1 << bit_depth_) - 1; + } + + protected: + void RunFwdTxfm(const Buffer &in, Buffer *out) { + fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride(), tx_type_); + } + + void RunInvTxfm(const Buffer &in, uint8_t *out) { + inv_txfm_(in.TopLeftPixel(), out, in.stride(), tx_type_); + } + + FhtFunc fwd_txfm_; + IhtFunc inv_txfm_; +}; + +TEST_P(TransHT, AccuracyCheck) { RunAccuracyCheck(1); } + +TEST_P(TransHT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(TransHT, MemCheck) { RunMemCheck(); } + +TEST_P(TransHT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } + +/* TODO:(johannkoenig) Determine why these fail AccuracyCheck + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 0, VPX_BITS_12), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 1, VPX_BITS_12), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 2, VPX_BITS_12), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 3, VPX_BITS_12), + */ +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, TransHT, + ::testing::Values( + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 0, VPX_BITS_10), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 1, VPX_BITS_10), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 2, VPX_BITS_10), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 3, VPX_BITS_10), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 0, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 1, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 2, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 3, VPX_BITS_8), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 0, VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, 
&iht8x8_10, 8, 1, VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 2, VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 3, VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 0, VPX_BITS_12), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 1, VPX_BITS_12), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 2, VPX_BITS_12), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 3, VPX_BITS_12), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 0, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 1, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 2, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 3, VPX_BITS_8), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 0, VPX_BITS_10), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 1, VPX_BITS_10), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 2, VPX_BITS_10), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 3, VPX_BITS_10), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 0, VPX_BITS_12), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 1, VPX_BITS_12), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 2, VPX_BITS_12), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 3, VPX_BITS_12), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 0, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 1, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 2, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 3, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + C, TransHT, + ::testing::Values( + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 0, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 1, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 2, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 3, VPX_BITS_8), + + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 0, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 1, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 2, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 3, VPX_BITS_8), + + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 0, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 1, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 2, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 3, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, TransHT, + ::testing::Values( + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 0, + VPX_BITS_8), + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 1, + VPX_BITS_8), + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 2, + VPX_BITS_8), + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 3, + VPX_BITS_8), + + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 0, VPX_BITS_8), + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 1, VPX_BITS_8), + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 2, VPX_BITS_8), + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 3, VPX_BITS_8), + + make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 0, VPX_BITS_8), + make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 1, VPX_BITS_8), + make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 2, VPX_BITS_8), + make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 3, + VPX_BITS_8))); +#endif // HAVE_SSE2 + +class TransWHT : public 
TransTestBase, + public ::testing::TestWithParam { + public: + TransWHT() { + fwd_txfm_ref = fwht_ref; + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + size_ = GET_PARAM(2); + tx_type_ = GET_PARAM(3); + bit_depth_ = GET_PARAM(4); + max_pixel_value_ = (1 << bit_depth_) - 1; + } + + protected: + void RunFwdTxfm(const Buffer &in, Buffer *out) { + fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); + } + + void RunInvTxfm(const Buffer &in, uint8_t *out) { + inv_txfm_(in.TopLeftPixel(), out, in.stride()); + } + + FdctFunc fwd_txfm_; + IdctFunc inv_txfm_; +}; + +TEST_P(TransWHT, AccuracyCheck) { RunAccuracyCheck(0); } + +TEST_P(TransWHT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(TransWHT, MemCheck) { RunMemCheck(); } + +TEST_P(TransWHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); } + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, TransWHT, + ::testing::Values( + make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 4, 0, VPX_BITS_10), + make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 4, 0, VPX_BITS_12), + make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 4, 0, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P(C, TransWHT, + ::testing::Values(make_tuple(&vp9_fwht4x4_c, + &vpx_iwht4x4_16_add_c, 4, + 0, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P(SSE2, TransWHT, + ::testing::Values(make_tuple(&vp9_fwht4x4_sse2, + &vpx_iwht4x4_16_add_sse2, + 4, 0, VPX_BITS_8))); +#endif // HAVE_SSE2 +} // namespace diff --git a/libs/libvpx/test/decode_api_test.cc b/libs/libvpx/test/decode_api_test.cc index 593637780e..4167cf3e0f 100644 --- a/libs/libvpx/test/decode_api_test.cc +++ b/libs/libvpx/test/decode_api_test.cc @@ -172,4 +172,21 @@ TEST(DecodeAPI, Vp9PeekSI) { } #endif // CONFIG_VP9_DECODER +TEST(DecodeAPI, HighBitDepthCapability) { +// VP8 should not claim VP9 HBD as a capability. +#if CONFIG_VP8_DECODER + const vpx_codec_caps_t vp8_caps = vpx_codec_get_caps(&vpx_codec_vp8_dx_algo); + EXPECT_EQ(vp8_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0); +#endif + +#if CONFIG_VP9_DECODER + const vpx_codec_caps_t vp9_caps = vpx_codec_get_caps(&vpx_codec_vp9_dx_algo); +#if CONFIG_VP9_HIGHBITDEPTH + EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, VPX_CODEC_CAP_HIGHBITDEPTH); +#else + EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0); +#endif +#endif +} + } // namespace diff --git a/libs/libvpx/test/decode_svc_test.cc b/libs/libvpx/test/decode_svc_test.cc new file mode 100644 index 0000000000..69f62f13bd --- /dev/null +++ b/libs/libvpx/test/decode_svc_test.cc @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/ivf_video_source.h" +#include "test/test_vectors.h" +#include "test/util.h" + +namespace { + +const unsigned int kNumFrames = 19; + +class DecodeSvcTest : public ::libvpx_test::DecoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + DecodeSvcTest() : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)) {} + virtual ~DecodeSvcTest() {} + + virtual void PreDecodeFrameHook( + const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) { + if (video.frame_number() == 0) + decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, spatial_layer_); + } + + virtual void DecompressedFrameHook(const vpx_image_t &img, + const unsigned int frame_number) { + ASSERT_EQ(img.d_w, width_); + ASSERT_EQ(img.d_h, height_); + total_frames_ = frame_number; + } + + int spatial_layer_; + unsigned int width_; + unsigned int height_; + unsigned int total_frames_; +}; + +// SVC test vector is 1280x720, with 3 spatial layers, and 20 frames. + +// Decode the SVC test vector, which has 3 spatial layers, and decode up to +// spatial layer 0. Verify the resolution of each decoded frame and the total +// number of frames decoded. This results in 1/4x1/4 resolution (320x180). +TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer0) { + const std::string filename = GET_PARAM(1); + testing::internal::scoped_ptr video; + video.reset(new libvpx_test::IVFVideoSource(filename)); + ASSERT_TRUE(video.get() != NULL); + video->Init(); + total_frames_ = 0; + spatial_layer_ = 0; + width_ = 320; + height_ = 180; + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + ASSERT_EQ(total_frames_, kNumFrames); +} + +// Decode the SVC test vector, which has 3 spatial layers, and decode up to +// spatial layer 1. Verify the resolution of each decoded frame and the total +// number of frames decoded. This results in 1/2x1/2 resolution (640x360). +TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer1) { + const std::string filename = GET_PARAM(1); + testing::internal::scoped_ptr video; + video.reset(new libvpx_test::IVFVideoSource(filename)); + ASSERT_TRUE(video.get() != NULL); + video->Init(); + total_frames_ = 0; + spatial_layer_ = 1; + width_ = 640; + height_ = 360; + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + ASSERT_EQ(total_frames_, kNumFrames); +} + +// Decode the SVC test vector, which has 3 spatial layers, and decode up to +// spatial layer 2. Verify the resolution of each decoded frame and the total +// number of frames decoded. This results in the full resolution (1280x720). +TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer2) { + const std::string filename = GET_PARAM(1); + testing::internal::scoped_ptr video; + video.reset(new libvpx_test::IVFVideoSource(filename)); + ASSERT_TRUE(video.get() != NULL); + video->Init(); + total_frames_ = 0; + spatial_layer_ = 2; + width_ = 1280; + height_ = 720; + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + ASSERT_EQ(total_frames_, kNumFrames); +} + +// Decode the SVC test vector, which has 3 spatial layers, and decode up to +// spatial layer 10. Verify the resolution of each decoded frame and the total +// number of frames decoded. This is beyond the number of spatial layers, so +// the decoding should result in the full resolution (1280x720). 
+TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer10) { + const std::string filename = GET_PARAM(1); + testing::internal::scoped_ptr video; + video.reset(new libvpx_test::IVFVideoSource(filename)); + ASSERT_TRUE(video.get() != NULL); + video->Init(); + total_frames_ = 0; + spatial_layer_ = 10; + width_ = 1280; + height_ = 720; + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + ASSERT_EQ(total_frames_, kNumFrames); +} + +VP9_INSTANTIATE_TEST_CASE( + DecodeSvcTest, ::testing::ValuesIn(libvpx_test::kVP9TestVectorsSvc, + libvpx_test::kVP9TestVectorsSvc + + libvpx_test::kNumVP9TestVectorsSvc)); +} // namespace diff --git a/libs/libvpx/test/decode_test_driver.cc b/libs/libvpx/test/decode_test_driver.cc index b738e0db1c..48680eb8e9 100644 --- a/libs/libvpx/test/decode_test_driver.cc +++ b/libs/libvpx/test/decode_test_driver.cc @@ -53,13 +53,13 @@ void DecoderTest::HandlePeekResult(Decoder *const decoder, * pass it is not a keyframe, so we only expect VPX_CODEC_OK on the first * frame, which must be a keyframe. */ if (video->frame_number() == 0) - ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: " - << vpx_codec_err_to_string(res_peek); + ASSERT_EQ(VPX_CODEC_OK, res_peek) + << "Peek return failed: " << vpx_codec_err_to_string(res_peek); } else { /* The Vp9 implementation of PeekStream returns an error only if the * data passed to it isn't a valid Vp9 chunk. */ - ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: " - << vpx_codec_err_to_string(res_peek); + ASSERT_EQ(VPX_CODEC_OK, res_peek) + << "Peek return failed: " << vpx_codec_err_to_string(res_peek); } } diff --git a/libs/libvpx/test/encode_api_test.cc b/libs/libvpx/test/encode_api_test.cc index 419e38506c..87e29b61db 100644 --- a/libs/libvpx/test/encode_api_test.cc +++ b/libs/libvpx/test/encode_api_test.cc @@ -62,4 +62,134 @@ TEST(EncodeAPI, InvalidParams) { } } +TEST(EncodeAPI, HighBitDepthCapability) { +// VP8 should not claim VP9 HBD as a capability. +#if CONFIG_VP8_ENCODER + const vpx_codec_caps_t vp8_caps = vpx_codec_get_caps(&vpx_codec_vp8_cx_algo); + EXPECT_EQ(vp8_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0); +#endif + +#if CONFIG_VP9_ENCODER + const vpx_codec_caps_t vp9_caps = vpx_codec_get_caps(&vpx_codec_vp9_cx_algo); +#if CONFIG_VP9_HIGHBITDEPTH + EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, VPX_CODEC_CAP_HIGHBITDEPTH); +#else + EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0); +#endif +#endif +} + +#if CONFIG_VP8_ENCODER +TEST(EncodeAPI, ImageSizeSetting) { + const int width = 711; + const int height = 360; + const int bps = 12; + vpx_image_t img; + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + uint8_t *img_buf = reinterpret_cast( + calloc(width * height * bps / 8, sizeof(*img_buf))); + vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg, 0); + + cfg.g_w = width; + cfg.g_h = height; + + vpx_img_wrap(&img, VPX_IMG_FMT_I420, width, height, 1, img_buf); + + vpx_codec_enc_init(&enc, vpx_codec_vp8_cx(), &cfg, 0); + + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, &img, 0, 1, 0, 0)); + + free(img_buf); + + vpx_codec_destroy(&enc); +} +#endif + +// Set up 2 spatial streams with 2 temporal layers per stream, and generate +// invalid configuration by setting the temporal layer rate allocation +// (ts_target_bitrate[]) to 0 for both layers. This should fail independent of +// CONFIG_MULTI_RES_ENCODING. 
+TEST(EncodeAPI, MultiResEncode) { + static const vpx_codec_iface_t *kCodecs[] = { +#if CONFIG_VP8_ENCODER + &vpx_codec_vp8_cx_algo, +#endif +#if CONFIG_VP9_ENCODER + &vpx_codec_vp9_cx_algo, +#endif + }; + const int width = 1280; + const int height = 720; + const int width_down = width / 2; + const int height_down = height / 2; + const int target_bitrate = 1000; + const int framerate = 30; + + for (int c = 0; c < NELEMENTS(kCodecs); ++c) { + const vpx_codec_iface_t *const iface = kCodecs[c]; + vpx_codec_ctx_t enc[2]; + vpx_codec_enc_cfg_t cfg[2]; + vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } }; + + memset(enc, 0, sizeof(enc)); + + for (int i = 0; i < 2; i++) { + vpx_codec_enc_config_default(iface, &cfg[i], 0); + } + + /* Highest-resolution encoder settings */ + cfg[0].g_w = width; + cfg[0].g_h = height; + cfg[0].rc_dropframe_thresh = 0; + cfg[0].rc_end_usage = VPX_CBR; + cfg[0].rc_resize_allowed = 0; + cfg[0].rc_min_quantizer = 2; + cfg[0].rc_max_quantizer = 56; + cfg[0].rc_undershoot_pct = 100; + cfg[0].rc_overshoot_pct = 15; + cfg[0].rc_buf_initial_sz = 500; + cfg[0].rc_buf_optimal_sz = 600; + cfg[0].rc_buf_sz = 1000; + cfg[0].g_error_resilient = 1; /* Enable error resilient mode */ + cfg[0].g_lag_in_frames = 0; + + cfg[0].kf_mode = VPX_KF_AUTO; + cfg[0].kf_min_dist = 3000; + cfg[0].kf_max_dist = 3000; + + cfg[0].rc_target_bitrate = target_bitrate; /* Set target bitrate */ + cfg[0].g_timebase.num = 1; /* Set fps */ + cfg[0].g_timebase.den = framerate; + + memcpy(&cfg[1], &cfg[0], sizeof(cfg[0])); + cfg[1].rc_target_bitrate = 500; + cfg[1].g_w = width_down; + cfg[1].g_h = height_down; + + for (int i = 0; i < 2; i++) { + cfg[i].ts_number_layers = 2; + cfg[i].ts_periodicity = 2; + cfg[i].ts_rate_decimator[0] = 2; + cfg[i].ts_rate_decimator[1] = 1; + cfg[i].ts_layer_id[0] = 0; + cfg[i].ts_layer_id[1] = 1; + // Invalid parameters. + cfg[i].ts_target_bitrate[0] = 0; + cfg[i].ts_target_bitrate[1] = 0; + } + + // VP9 should report incapable, VP8 invalid for all configurations. + const char kVP9Name[] = "WebM Project VP9"; + const bool is_vp9 = strncmp(kVP9Name, vpx_codec_iface_name(iface), + sizeof(kVP9Name) - 1) == 0; + EXPECT_EQ(is_vp9 ? 
VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0])); + + for (int i = 0; i < 2; i++) { + vpx_codec_destroy(&enc[i]); + } + } +} + } // namespace diff --git a/libs/libvpx/test/encode_test_driver.cc b/libs/libvpx/test/encode_test_driver.cc index 632c98f05a..b2cbc3f05b 100644 --- a/libs/libvpx/test/encode_test_driver.cc +++ b/libs/libvpx/test/encode_test_driver.cc @@ -201,6 +201,8 @@ void EncoderTest::RunLoop(VideoSource *video) { PreEncodeFrameHook(video, encoder.get()); encoder->EncodeFrame(video, frame_flags_); + PostEncodeFrameHook(encoder.get()); + CxDataIterator iter = encoder->GetCxData(); bool has_cxdata = false; @@ -226,6 +228,8 @@ void EncoderTest::RunLoop(VideoSource *video) { case VPX_CODEC_PSNR_PKT: PSNRPktHook(pkt); break; + case VPX_CODEC_STATS_PKT: StatsPktHook(pkt); break; + default: break; } } diff --git a/libs/libvpx/test/encode_test_driver.h b/libs/libvpx/test/encode_test_driver.h index 09b1a78344..89a3b1767e 100644 --- a/libs/libvpx/test/encode_test_driver.h +++ b/libs/libvpx/test/encode_test_driver.h @@ -139,6 +139,13 @@ class Encoder { } #endif +#if CONFIG_VP8_ENCODER + void Control(int ctrl_id, vpx_roi_map_t *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } +#endif + void Config(const vpx_codec_enc_cfg_t *cfg) { const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); @@ -212,12 +219,17 @@ class EncoderTest { virtual void PreEncodeFrameHook(VideoSource * /*video*/, Encoder * /*encoder*/) {} + virtual void PostEncodeFrameHook(Encoder * /*encoder*/) {} + // Hook to be called on every compressed data packet. virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {} // Hook to be called on every PSNR packet. virtual void PSNRPktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {} + // Hook to be called on every first pass stats packet. + virtual void StatsPktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {} + // Hook to determine whether the encode loop should continue. virtual bool Continue() const { return !(::testing::Test::HasFatalFailure() || abort_); diff --git a/libs/libvpx/test/error_resilience_test.cc b/libs/libvpx/test/error_resilience_test.cc index 4c7ede8cae..030b67c572 100644 --- a/libs/libvpx/test/error_resilience_test.cc +++ b/libs/libvpx/test/error_resilience_test.cc @@ -90,8 +90,7 @@ class ErrorResilienceTestLarge return frame_flags; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - ::libvpx_test::Encoder * /*encoder*/) { + virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video) { frame_flags_ &= ~(VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF); // For temporal layer case. diff --git a/libs/libvpx/test/examples.sh b/libs/libvpx/test/examples.sh index 39f7e392db..629f04239c 100755 --- a/libs/libvpx/test/examples.sh +++ b/libs/libvpx/test/examples.sh @@ -15,7 +15,7 @@ example_tests=$(ls $(dirname $0)/*.sh) # List of script names to exclude. -exclude_list="examples tools_common" +exclude_list="examples stress tools_common" # Filter out the scripts in $exclude_list. 
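// Illustrative sketch (not part of the patch): the Control(int ctrl_id,
// vpx_roi_map_t *) overload added to the encoder test driver above forwards
// to vpx_codec_control_(), i.e. the same call an application would make,
// presumably with VP8E_SET_ROI_MAP, to attach a VP8 region-of-interest map.
// A minimal, hypothetical setup (SetCenterRoi and the segment choices are
// made up for illustration):

#include <cstring>
#include <vector>
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

static vpx_codec_err_t SetCenterRoi(vpx_codec_ctx_t *enc, int frame_width,
                                    int frame_height) {
  const int mb_cols = (frame_width + 15) / 16;   // VP8 ROI maps are indexed
  const int mb_rows = (frame_height + 15) / 16;  // per 16x16 macroblock
  std::vector<unsigned char> map(mb_rows * mb_cols, 0);
  // Put the middle third of the frame into segment 1.
  for (int r = mb_rows / 3; r < 2 * mb_rows / 3; ++r) {
    for (int c = mb_cols / 3; c < 2 * mb_cols / 3; ++c) {
      map[r * mb_cols + c] = 1;
    }
  }

  vpx_roi_map_t roi;
  memset(&roi, 0, sizeof(roi));
  roi.rows = static_cast<unsigned int>(mb_rows);
  roi.cols = static_cast<unsigned int>(mb_cols);
  roi.roi_map = &map[0];
  roi.delta_q[1] = -10;  // give segment 1 a lower quantizer (better quality)
  return vpx_codec_control(enc, VP8E_SET_ROI_MAP, &roi);
}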
for word in ${exclude_list}; do diff --git a/libs/libvpx/test/external_frame_buffer_test.cc b/libs/libvpx/test/external_frame_buffer_test.cc index f9686695a7..dbf2971198 100644 --- a/libs/libvpx/test/external_frame_buffer_test.cc +++ b/libs/libvpx/test/external_frame_buffer_test.cc @@ -34,7 +34,8 @@ struct ExternalFrameBuffer { // Class to manipulate a list of external frame buffers. class ExternalFrameBufferList { public: - ExternalFrameBufferList() : num_buffers_(0), ext_fb_list_(NULL) {} + ExternalFrameBufferList() + : num_buffers_(0), num_used_buffers_(0), ext_fb_list_(NULL) {} virtual ~ExternalFrameBufferList() { for (int i = 0; i < num_buffers_; ++i) { @@ -71,6 +72,8 @@ class ExternalFrameBufferList { } SetFrameBuffer(idx, fb); + + num_used_buffers_++; return 0; } @@ -106,6 +109,7 @@ class ExternalFrameBufferList { } EXPECT_EQ(1, ext_fb->in_use); ext_fb->in_use = 0; + num_used_buffers_--; return 0; } @@ -121,6 +125,8 @@ class ExternalFrameBufferList { } } + int num_used_buffers() const { return num_used_buffers_; } + private: // Returns the index of the first free frame buffer. Returns |num_buffers_| // if there are no free frame buffers. @@ -145,6 +151,7 @@ class ExternalFrameBufferList { } int num_buffers_; + int num_used_buffers_; ExternalFrameBuffer *ext_fb_list_; }; @@ -220,8 +227,8 @@ class ExternalFrameBufferMD5Test void OpenMD5File(const std::string &md5_file_name_) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); - ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: " - << md5_file_name_; + ASSERT_TRUE(md5_file_ != NULL) + << "Md5 file open failed. Filename: " << md5_file_name_; } virtual void DecompressedFrameHook(const vpx_image_t &img, @@ -273,6 +280,7 @@ class ExternalFrameBufferMD5Test #if CONFIG_WEBM_IO const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm"; +const char kVP9NonRefTestFile[] = "vp90-2-22-svc_1280x720_1.webm"; // Class for testing passing in external frame buffers to libvpx. class ExternalFrameBufferTest : public ::testing::Test { @@ -292,7 +300,9 @@ class ExternalFrameBufferTest : public ::testing::Test { virtual void TearDown() { delete decoder_; + decoder_ = NULL; delete video_; + video_ = NULL; } // Passes the external frame buffer information to libvpx. @@ -325,7 +335,7 @@ class ExternalFrameBufferTest : public ::testing::Test { return VPX_CODEC_OK; } - private: + protected: void CheckDecodedFrames() { libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData(); const vpx_image_t *img = NULL; @@ -341,6 +351,25 @@ class ExternalFrameBufferTest : public ::testing::Test { int num_buffers_; ExternalFrameBufferList fb_list_; }; + +class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest { + protected: + virtual void SetUp() { + video_ = new libvpx_test::WebMVideoSource(kVP9NonRefTestFile); + ASSERT_TRUE(video_ != NULL); + video_->Init(); + video_->Begin(); + + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); + decoder_ = new libvpx_test::VP9Decoder(cfg, 0); + ASSERT_TRUE(decoder_ != NULL); + } + + virtual void CheckFrameBufferRelease() { + TearDown(); + ASSERT_EQ(0, fb_list_.num_used_buffers()); + } +}; #endif // CONFIG_WEBM_IO // This test runs through the set of test vectors, and decodes them. @@ -419,6 +448,8 @@ TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) { SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer)); ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame()); + // Only run this on long clips. 
Decoding a very short clip will return + // VPX_CODEC_OK even with only 2 buffers. ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeRemainingFrames()); } @@ -467,6 +498,15 @@ TEST_F(ExternalFrameBufferTest, SetAfterDecode) { SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer)); } + +TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) { + const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(VPX_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, + release_vp9_frame_buffer)); + ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames()); + CheckFrameBufferRelease(); +} #endif // CONFIG_WEBM_IO VP9_INSTANTIATE_TEST_CASE( diff --git a/libs/libvpx/test/fdct4x4_test.cc b/libs/libvpx/test/fdct4x4_test.cc deleted file mode 100644 index 1270dae94a..0000000000 --- a/libs/libvpx/test/fdct4x4_test.cc +++ /dev/null @@ -1,512 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include -#include - -#include "third_party/googletest/src/include/gtest/gtest.h" - -#include "./vp9_rtcd.h" -#include "./vpx_dsp_rtcd.h" -#include "test/acm_random.h" -#include "test/clear_system_state.h" -#include "test/register_state_check.h" -#include "test/util.h" -#include "vp9/common/vp9_entropy.h" -#include "vpx/vpx_codec.h" -#include "vpx/vpx_integer.h" -#include "vpx_ports/mem.h" - -using libvpx_test::ACMRandom; - -namespace { -const int kNumCoeffs = 16; -typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); -typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, - int tx_type); -typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, - int tx_type); - -typedef std::tr1::tuple Dct4x4Param; -typedef std::tr1::tuple Ht4x4Param; - -void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride, - int /*tx_type*/) { - vpx_fdct4x4_c(in, out, stride); -} - -void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { - vp9_fht4x4_c(in, out, stride, tx_type); -} - -void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride, - int /*tx_type*/) { - vp9_fwht4x4_c(in, out, stride); -} - -#if CONFIG_VP9_HIGHBITDEPTH -void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_c(in, out, stride, 10); -} - -void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_c(in, out, stride, 12); -} - -void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 10); -} - -void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 12); -} - -void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_iwht4x4_16_add_c(in, out, stride, 10); -} - -void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_iwht4x4_16_add_c(in, out, stride, 12); -} - -#if HAVE_SSE2 -void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 10); -} - -void 
idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 12); -} -#endif // HAVE_SSE2 -#endif // CONFIG_VP9_HIGHBITDEPTH - -class Trans4x4TestBase { - public: - virtual ~Trans4x4TestBase() {} - - protected: - virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0; - - virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0; - - void RunAccuracyCheck(int limit) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - uint32_t max_error = 0; - int64_t total_error = 0; - const int count_test_block = 10000; - for (int i = 0; i < count_test_block; ++i) { - DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]); -#if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); - DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]); -#endif - - // Initialize a test block with input range [-255, 255]. - for (int j = 0; j < kNumCoeffs; ++j) { - if (bit_depth_ == VPX_BITS_8) { - src[j] = rnd.Rand8(); - dst[j] = rnd.Rand8(); - test_input_block[j] = src[j] - dst[j]; -#if CONFIG_VP9_HIGHBITDEPTH - } else { - src16[j] = rnd.Rand16() & mask_; - dst16[j] = rnd.Rand16() & mask_; - test_input_block[j] = src16[j] - dst16[j]; -#endif - } - } - - ASM_REGISTER_STATE_CHECK( - RunFwdTxfm(test_input_block, test_temp_block, pitch_)); - if (bit_depth_ == VPX_BITS_8) { - ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_)); -#if CONFIG_VP9_HIGHBITDEPTH - } else { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); -#endif - } - - for (int j = 0; j < kNumCoeffs; ++j) { -#if CONFIG_VP9_HIGHBITDEPTH - const int diff = - bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; -#else - ASSERT_EQ(VPX_BITS_8, bit_depth_); - const int diff = dst[j] - src[j]; -#endif - const uint32_t error = diff * diff; - if (max_error < error) max_error = error; - total_error += error; - } - } - - EXPECT_GE(static_cast(limit), max_error) - << "Error: 4x4 FHT/IHT has an individual round trip error > " << limit; - - EXPECT_GE(count_test_block * limit, total_error) - << "Error: 4x4 FHT/IHT has average round trip error > " << limit - << " per block"; - } - - void RunCoeffCheck() { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 5000; - DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]); - - for (int i = 0; i < count_test_block; ++i) { - // Initialize a test block with input range [-mask_, mask_]. - for (int j = 0; j < kNumCoeffs; ++j) { - input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_); - } - - fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_); - ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_)); - - // The minimum quant value is 4. 
- for (int j = 0; j < kNumCoeffs; ++j) - EXPECT_EQ(output_block[j], output_ref_block[j]); - } - } - - void RunMemCheck() { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 5000; - DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]); - - for (int i = 0; i < count_test_block; ++i) { - // Initialize a test block with input range [-mask_, mask_]. - for (int j = 0; j < kNumCoeffs; ++j) { - input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_; - } - if (i == 0) { - for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_; - } else if (i == 1) { - for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_; - } - - fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_); - ASM_REGISTER_STATE_CHECK( - RunFwdTxfm(input_extreme_block, output_block, pitch_)); - - // The minimum quant value is 4. - for (int j = 0; j < kNumCoeffs; ++j) { - EXPECT_EQ(output_block[j], output_ref_block[j]); - EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j])) - << "Error: 4x4 FDCT has coefficient larger than 4*DCT_MAX_VALUE"; - } - } - } - - void RunInvAccuracyCheck(int limit) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 1000; - DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]); -#if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); - DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]); -#endif - - for (int i = 0; i < count_test_block; ++i) { - // Initialize a test block with input range [-mask_, mask_]. - for (int j = 0; j < kNumCoeffs; ++j) { - if (bit_depth_ == VPX_BITS_8) { - src[j] = rnd.Rand8(); - dst[j] = rnd.Rand8(); - in[j] = src[j] - dst[j]; -#if CONFIG_VP9_HIGHBITDEPTH - } else { - src16[j] = rnd.Rand16() & mask_; - dst16[j] = rnd.Rand16() & mask_; - in[j] = src16[j] - dst16[j]; -#endif - } - } - - fwd_txfm_ref(in, coeff, pitch_, tx_type_); - - if (bit_depth_ == VPX_BITS_8) { - ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_)); -#if CONFIG_VP9_HIGHBITDEPTH - } else { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_)); -#endif - } - - for (int j = 0; j < kNumCoeffs; ++j) { -#if CONFIG_VP9_HIGHBITDEPTH - const int diff = - bit_depth_ == VPX_BITS_8 ? 
dst[j] - src[j] : dst16[j] - src16[j]; -#else - const int diff = dst[j] - src[j]; -#endif - const uint32_t error = diff * diff; - EXPECT_GE(static_cast(limit), error) - << "Error: 4x4 IDCT has error " << error << " at index " << j; - } - } - } - - int pitch_; - int tx_type_; - FhtFunc fwd_txfm_ref; - vpx_bit_depth_t bit_depth_; - int mask_; -}; - -class Trans4x4DCT : public Trans4x4TestBase, - public ::testing::TestWithParam { - public: - virtual ~Trans4x4DCT() {} - - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - tx_type_ = GET_PARAM(2); - pitch_ = 4; - fwd_txfm_ref = fdct4x4_ref; - bit_depth_ = GET_PARAM(3); - mask_ = (1 << bit_depth_) - 1; - } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { - fwd_txfm_(in, out, stride); - } - void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { - inv_txfm_(out, dst, stride); - } - - FdctFunc fwd_txfm_; - IdctFunc inv_txfm_; -}; - -TEST_P(Trans4x4DCT, AccuracyCheck) { RunAccuracyCheck(1); } - -TEST_P(Trans4x4DCT, CoeffCheck) { RunCoeffCheck(); } - -TEST_P(Trans4x4DCT, MemCheck) { RunMemCheck(); } - -TEST_P(Trans4x4DCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } - -class Trans4x4HT : public Trans4x4TestBase, - public ::testing::TestWithParam { - public: - virtual ~Trans4x4HT() {} - - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - tx_type_ = GET_PARAM(2); - pitch_ = 4; - fwd_txfm_ref = fht4x4_ref; - bit_depth_ = GET_PARAM(3); - mask_ = (1 << bit_depth_) - 1; - } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { - fwd_txfm_(in, out, stride, tx_type_); - } - - void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { - inv_txfm_(out, dst, stride, tx_type_); - } - - FhtFunc fwd_txfm_; - IhtFunc inv_txfm_; -}; - -TEST_P(Trans4x4HT, AccuracyCheck) { RunAccuracyCheck(1); } - -TEST_P(Trans4x4HT, CoeffCheck) { RunCoeffCheck(); } - -TEST_P(Trans4x4HT, MemCheck) { RunMemCheck(); } - -TEST_P(Trans4x4HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } - -class Trans4x4WHT : public Trans4x4TestBase, - public ::testing::TestWithParam { - public: - virtual ~Trans4x4WHT() {} - - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - tx_type_ = GET_PARAM(2); - pitch_ = 4; - fwd_txfm_ref = fwht4x4_ref; - bit_depth_ = GET_PARAM(3); - mask_ = (1 << bit_depth_) - 1; - } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { - fwd_txfm_(in, out, stride); - } - void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { - inv_txfm_(out, dst, stride); - } - - FdctFunc fwd_txfm_; - IdctFunc inv_txfm_; -}; - -TEST_P(Trans4x4WHT, AccuracyCheck) { RunAccuracyCheck(0); } - -TEST_P(Trans4x4WHT, CoeffCheck) { RunCoeffCheck(); } - -TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); } - -TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); } -using std::tr1::make_tuple; - -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - C, Trans4x4DCT, - ::testing::Values( - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12), - make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P(C, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_c, - &vpx_idct4x4_16_add_c, 
0, - VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - C, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P( - C, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - C, Trans4x4WHT, - ::testing::Values( - make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12), - make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P(C, Trans4x4WHT, - ::testing::Values(make_tuple(&vp9_fwht4x4_c, - &vpx_iwht4x4_16_add_c, 0, - VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(NEON, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_c, - &vpx_idct4x4_16_add_neon, - 0, VPX_BITS_8))); -#endif // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE - -#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - NEON, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8))); -#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE - -#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4WHT, - ::testing::Values( - make_tuple(&vp9_fwht4x4_sse2, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8))); -#endif - -#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(SSE2, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_sse2, - &vpx_idct4x4_16_add_sse2, - 0, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8))); -#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE - -#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE 
-INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4DCT, - ::testing::Values( - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10_sse2, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12_sse2, 0, VPX_BITS_12), - make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, VPX_BITS_12), - make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8))); - -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8))); -#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE - -#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(MSA, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_msa, - &vpx_idct4x4_16_add_msa, 0, - VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - MSA, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 3, VPX_BITS_8))); -#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -} // namespace diff --git a/libs/libvpx/test/fdct8x8_test.cc b/libs/libvpx/test/fdct8x8_test.cc index 3ac73c1250..5021dda9b3 100644 --- a/libs/libvpx/test/fdct8x8_test.cc +++ b/libs/libvpx/test/fdct8x8_test.cc @@ -88,45 +88,45 @@ void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { #if CONFIG_VP9_HIGHBITDEPTH void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_c(in, out, stride, 10); + vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_c(in, out, stride, 12); + vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 10); + vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10); } void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12); + vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12); } #if HAVE_SSE2 -void idct8x8_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_10_add_c(in, out, stride, 10); +void idct8x8_12_add_10_c(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } -void idct8x8_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_10_add_c(in, out, stride, 12); +void idct8x8_12_add_12_c(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } -void idct8x8_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_10_add_sse2(in, out, stride, 10); +void idct8x8_12_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } -void idct8x8_10_add_12_sse2(const tran_low_t *in, 
uint8_t *out, int stride) { - vpx_highbd_idct8x8_10_add_sse2(in, out, stride, 12); +void idct8x8_12_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 10); + vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 12); + vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } #endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH @@ -257,7 +257,7 @@ class FwdTrans8x8TestBase { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -340,7 +340,7 @@ class FwdTrans8x8TestBase { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -413,7 +413,7 @@ class FwdTrans8x8TestBase { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -497,9 +497,9 @@ class FwdTrans8x8TestBase { ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_)); #if CONFIG_VP9_HIGHBITDEPTH } else { - ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_); + ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -511,8 +511,8 @@ class FwdTrans8x8TestBase { const int diff = dst[j] - ref[j]; #endif const uint32_t error = diff * diff; - EXPECT_EQ(0u, error) << "Error: 8x8 IDCT has error " << error - << " at index " << j; + EXPECT_EQ(0u, error) + << "Error: 8x8 IDCT has error " << error << " at index " << j; } } } @@ -670,14 +670,12 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH -#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT, ::testing::Values(make_tuple(&vpx_fdct8x8_neon, &vpx_idct8x8_64_add_neon, 0, VPX_BITS_8))); -#endif // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE - -#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#if !CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P( NEON, FwdTrans8x8HT, ::testing::Values( @@ -685,7 +683,8 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8), make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8), make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8))); -#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P(SSE2, FwdTrans8x8DCT, @@ -728,10 +727,10 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE2, InvTrans8x8DCT, ::testing::Values( - make_tuple(&idct8x8_10_add_10_c, &idct8x8_10_add_10_sse2, 6225, + make_tuple(&idct8x8_12_add_10_c, 
&idct8x8_12_add_10_sse2, 6225, VPX_BITS_10), make_tuple(&idct8x8_10, &idct8x8_64_add_10_sse2, 6225, VPX_BITS_10), - make_tuple(&idct8x8_10_add_12_c, &idct8x8_10_add_12_sse2, 6225, + make_tuple(&idct8x8_12_add_12_c, &idct8x8_12_add_12_sse2, 6225, VPX_BITS_12), make_tuple(&idct8x8_12, &idct8x8_64_add_12_sse2, 6225, VPX_BITS_12))); #endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -740,7 +739,7 @@ INSTANTIATE_TEST_CASE_P( !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT, ::testing::Values(make_tuple(&vpx_fdct8x8_ssse3, - &vpx_idct8x8_64_add_ssse3, + &vpx_idct8x8_64_add_sse2, 0, VPX_BITS_8))); #endif @@ -757,4 +756,11 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 2, VPX_BITS_8), make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 3, VPX_BITS_8))); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P(VSX, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_c, + &vpx_idct8x8_64_add_vsx, 0, + VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libs/libvpx/test/hadamard_test.cc b/libs/libvpx/test/hadamard_test.cc index e7715958ed..3b7cfeddcf 100644 --- a/libs/libvpx/test/hadamard_test.cc +++ b/libs/libvpx/test/hadamard_test.cc @@ -13,6 +13,7 @@ #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_ports/vpx_timer.h" #include "test/acm_random.h" #include "test/register_state_check.h" @@ -21,7 +22,8 @@ namespace { using ::libvpx_test::ACMRandom; -typedef void (*HadamardFunc)(const int16_t *a, int a_stride, int16_t *b); +typedef void (*HadamardFunc)(const int16_t *a, ptrdiff_t a_stride, + tran_low_t *b); void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) { int16_t b[8]; @@ -46,18 +48,16 @@ void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) { out[5] = c[3] - c[7]; } -void reference_hadamard8x8(const int16_t *a, int a_stride, int16_t *b) { +void reference_hadamard8x8(const int16_t *a, int a_stride, tran_low_t *b) { int16_t buf[64]; - for (int i = 0; i < 8; ++i) { - hadamard_loop(a + i, a_stride, buf + i * 8); - } + int16_t buf2[64]; + for (int i = 0; i < 8; ++i) hadamard_loop(a + i, a_stride, buf + i * 8); + for (int i = 0; i < 8; ++i) hadamard_loop(buf + i, 8, buf2 + i * 8); - for (int i = 0; i < 8; ++i) { - hadamard_loop(buf + i, 8, b + i * 8); - } + for (int i = 0; i < 64; ++i) b[i] = (tran_low_t)buf2[i]; } -void reference_hadamard16x16(const int16_t *a, int a_stride, int16_t *b) { +void reference_hadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) { /* The source is a 16x16 block. The destination is rearranged to 8x32. * Input is 9 bit. */ reference_hadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0); @@ -68,16 +68,16 @@ void reference_hadamard16x16(const int16_t *a, int a_stride, int16_t *b) { /* Overlay the 8x8 blocks and combine. */ for (int i = 0; i < 64; ++i) { /* 8x8 steps the range up to 15 bits. */ - const int16_t a0 = b[0]; - const int16_t a1 = b[64]; - const int16_t a2 = b[128]; - const int16_t a3 = b[192]; + const tran_low_t a0 = b[0]; + const tran_low_t a1 = b[64]; + const tran_low_t a2 = b[128]; + const tran_low_t a3 = b[192]; /* Prevent the result from escaping int16_t. 
*/ - const int16_t b0 = (a0 + a1) >> 1; - const int16_t b1 = (a0 - a1) >> 1; - const int16_t b2 = (a2 + a3) >> 1; - const int16_t b3 = (a2 - a3) >> 1; + const tran_low_t b0 = (a0 + a1) >> 1; + const tran_low_t b1 = (a0 - a1) >> 1; + const tran_low_t b2 = (a2 + a3) >> 1; + const tran_low_t b3 = (a2 - a3) >> 1; /* Store a 16 bit value. */ b[0] = b0 + b2; @@ -101,12 +101,35 @@ class HadamardTestBase : public ::testing::TestWithParam { ACMRandom rnd_; }; +void HadamardSpeedTest(const char *name, HadamardFunc const func, + const int16_t *input, int stride, tran_low_t *output, + int times) { + int i; + vpx_usec_timer timer; + + vpx_usec_timer_start(&timer); + for (i = 0; i < times; ++i) { + func(input, stride, output); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("%s[%12d runs]: %d us\n", name, times, elapsed_time); +} + class Hadamard8x8Test : public HadamardTestBase {}; +void HadamardSpeedTest8x8(HadamardFunc const func, int times) { + DECLARE_ALIGNED(16, int16_t, input[64]); + DECLARE_ALIGNED(16, tran_low_t, output[64]); + memset(input, 1, sizeof(input)); + HadamardSpeedTest("Hadamard8x8", func, input, 8, output, times); +} + TEST_P(Hadamard8x8Test, CompareReferenceRandom) { DECLARE_ALIGNED(16, int16_t, a[64]); - DECLARE_ALIGNED(16, int16_t, b[64]); - int16_t b_ref[64]; + DECLARE_ALIGNED(16, tran_low_t, b[64]); + tran_low_t b_ref[64]; for (int i = 0; i < 64; ++i) { a[i] = rnd_.Rand9Signed(); } @@ -124,8 +147,8 @@ TEST_P(Hadamard8x8Test, CompareReferenceRandom) { TEST_P(Hadamard8x8Test, VaryStride) { DECLARE_ALIGNED(16, int16_t, a[64 * 8]); - DECLARE_ALIGNED(16, int16_t, b[64]); - int16_t b_ref[64]; + DECLARE_ALIGNED(16, tran_low_t, b[64]); + tran_low_t b_ref[64]; for (int i = 0; i < 64 * 8; ++i) { a[i] = rnd_.Rand9Signed(); } @@ -144,6 +167,12 @@ TEST_P(Hadamard8x8Test, VaryStride) { } } +TEST_P(Hadamard8x8Test, DISABLED_Speed) { + HadamardSpeedTest8x8(h_func_, 10); + HadamardSpeedTest8x8(h_func_, 10000); + HadamardSpeedTest8x8(h_func_, 10000000); +} + INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test, ::testing::Values(&vpx_hadamard_8x8_c)); @@ -162,12 +191,33 @@ INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test, ::testing::Values(&vpx_hadamard_8x8_neon)); #endif // HAVE_NEON +// TODO(jingning): Remove highbitdepth flag when the SIMD functions are +// in place and turn on the unit test. 
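// Worked range check (illustrative, not part of the patch): the pre-scaling
// by >> 1 in reference_hadamard16x16() above is what keeps the combined
// transform inside the 16-bit range even though the output type is now
// tran_low_t. Under the 9-bit input assumption stated in the source comments:
//
//   |x|                  <= 2^8   (9-bit signed input)
//   |a0|,|a1|,|a2|,|a3|  <= 64 * 2^8 = 2^14   (one 8x8 Hadamard, "15 bits")
//   |(a0 +/- a1) >> 1|   <= 2^14   (halving restores the 15-bit headroom)
//   |b0 +/- b2|          <= 2^15   (the "16 bit value" the comment promises)
//
// Without the shift the straight combination a0 + a1 + a2 + a3 could reach
// 2^16 and overflow an int16_t intermediate.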
+#if !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_MSA +INSTANTIATE_TEST_CASE_P(MSA, Hadamard8x8Test, + ::testing::Values(&vpx_hadamard_8x8_msa)); +#endif // HAVE_MSA +#endif // !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_VSX +INSTANTIATE_TEST_CASE_P(VSX, Hadamard8x8Test, + ::testing::Values(&vpx_hadamard_8x8_vsx)); +#endif // HAVE_VSX + class Hadamard16x16Test : public HadamardTestBase {}; +void HadamardSpeedTest16x16(HadamardFunc const func, int times) { + DECLARE_ALIGNED(16, int16_t, input[256]); + DECLARE_ALIGNED(16, tran_low_t, output[256]); + memset(input, 1, sizeof(input)); + HadamardSpeedTest("Hadamard16x16", func, input, 16, output, times); +} + TEST_P(Hadamard16x16Test, CompareReferenceRandom) { DECLARE_ALIGNED(16, int16_t, a[16 * 16]); - DECLARE_ALIGNED(16, int16_t, b[16 * 16]); - int16_t b_ref[16 * 16]; + DECLARE_ALIGNED(16, tran_low_t, b[16 * 16]); + tran_low_t b_ref[16 * 16]; for (int i = 0; i < 16 * 16; ++i) { a[i] = rnd_.Rand9Signed(); } @@ -185,8 +235,8 @@ TEST_P(Hadamard16x16Test, CompareReferenceRandom) { TEST_P(Hadamard16x16Test, VaryStride) { DECLARE_ALIGNED(16, int16_t, a[16 * 16 * 8]); - DECLARE_ALIGNED(16, int16_t, b[16 * 16]); - int16_t b_ref[16 * 16]; + DECLARE_ALIGNED(16, tran_low_t, b[16 * 16]); + tran_low_t b_ref[16 * 16]; for (int i = 0; i < 16 * 16 * 8; ++i) { a[i] = rnd_.Rand9Signed(); } @@ -205,6 +255,12 @@ TEST_P(Hadamard16x16Test, VaryStride) { } } +TEST_P(Hadamard16x16Test, DISABLED_Speed) { + HadamardSpeedTest16x16(h_func_, 10); + HadamardSpeedTest16x16(h_func_, 10000); + HadamardSpeedTest16x16(h_func_, 10000000); +} + INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test, ::testing::Values(&vpx_hadamard_16x16_c)); @@ -213,8 +269,25 @@ INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test, ::testing::Values(&vpx_hadamard_16x16_sse2)); #endif // HAVE_SSE2 +#if HAVE_AVX2 +INSTANTIATE_TEST_CASE_P(AVX2, Hadamard16x16Test, + ::testing::Values(&vpx_hadamard_16x16_avx2)); +#endif // HAVE_AVX2 + +#if HAVE_VSX +INSTANTIATE_TEST_CASE_P(VSX, Hadamard16x16Test, + ::testing::Values(&vpx_hadamard_16x16_vsx)); +#endif // HAVE_VSX + #if HAVE_NEON INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test, ::testing::Values(&vpx_hadamard_16x16_neon)); #endif // HAVE_NEON + +#if !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_MSA +INSTANTIATE_TEST_CASE_P(MSA, Hadamard16x16Test, + ::testing::Values(&vpx_hadamard_16x16_msa)); +#endif // HAVE_MSA +#endif // !CONFIG_VP9_HIGHBITDEPTH } // namespace diff --git a/libs/libvpx/test/idct_test.cc b/libs/libvpx/test/idct_test.cc index f54f2c005a..3700374d7a 100644 --- a/libs/libvpx/test/idct_test.cc +++ b/libs/libvpx/test/idct_test.cc @@ -13,6 +13,7 @@ #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/buffer.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "vpx/vpx_integer.h" @@ -21,106 +22,156 @@ typedef void (*IdctFunc)(int16_t *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); namespace { + +using libvpx_test::Buffer; + class IDCTTest : public ::testing::TestWithParam { protected: virtual void SetUp() { - int i; - UUT = GetParam(); - memset(input, 0, sizeof(input)); - /* Set up guard blocks */ - for (i = 0; i < 256; i++) output[i] = ((i & 0xF) < 4 && (i < 64)) ? 
0 : -1; + + input = new Buffer(4, 4, 0); + ASSERT_TRUE(input != NULL); + ASSERT_TRUE(input->Init()); + predict = new Buffer(4, 4, 3); + ASSERT_TRUE(predict != NULL); + ASSERT_TRUE(predict->Init()); + output = new Buffer(4, 4, 3); + ASSERT_TRUE(output != NULL); + ASSERT_TRUE(output->Init()); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + virtual void TearDown() { + delete input; + delete predict; + delete output; + libvpx_test::ClearSystemState(); + } IdctFunc UUT; - int16_t input[16]; - unsigned char output[256]; - unsigned char predict[256]; + Buffer *input; + Buffer *predict; + Buffer *output; }; -TEST_P(IDCTTest, TestGuardBlocks) { - int i; - - for (i = 0; i < 256; i++) { - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(0, output[i]) << i; - else - EXPECT_EQ(255, output[i]); - } -} - TEST_P(IDCTTest, TestAllZeros) { - int i; + // When the input is '0' the output will be '0'. + input->Set(0); + predict->Set(0); + output->Set(0); - ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); + ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(), + predict->stride(), output->TopLeftPixel(), + output->stride())); - for (i = 0; i < 256; i++) { - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(0, output[i]) << "i==" << i; - else - EXPECT_EQ(255, output[i]) << "i==" << i; - } + ASSERT_TRUE(input->CheckValues(0)); + ASSERT_TRUE(input->CheckPadding()); + ASSERT_TRUE(output->CheckValues(0)); + ASSERT_TRUE(output->CheckPadding()); } TEST_P(IDCTTest, TestAllOnes) { - int i; + input->Set(0); + // When the first element is '4' it will fill the output buffer with '1'. + input->TopLeftPixel()[0] = 4; + predict->Set(0); + output->Set(0); - input[0] = 4; - ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); + ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(), + predict->stride(), output->TopLeftPixel(), + output->stride())); - for (i = 0; i < 256; i++) { - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(1, output[i]) << "i==" << i; - else - EXPECT_EQ(255, output[i]) << "i==" << i; - } + ASSERT_TRUE(output->CheckValues(1)); + ASSERT_TRUE(output->CheckPadding()); } TEST_P(IDCTTest, TestAddOne) { - int i; + // Set the transform output to '1' and make sure it gets added to the + // prediction buffer. + input->Set(0); + input->TopLeftPixel()[0] = 4; + output->Set(0); - for (i = 0; i < 256; i++) predict[i] = i; - input[0] = 4; - ASM_REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16)); - - for (i = 0; i < 256; i++) { - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(i + 1, output[i]) << "i==" << i; - else - EXPECT_EQ(255, output[i]) << "i==" << i; + uint8_t *pred = predict->TopLeftPixel(); + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 4; ++x) { + pred[y * predict->stride() + x] = y * 4 + x; + } } + + ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(), + predict->stride(), output->TopLeftPixel(), + output->stride())); + + uint8_t const *out = output->TopLeftPixel(); + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 4; ++x) { + EXPECT_EQ(1 + y * 4 + x, out[y * output->stride() + x]); + } + } + + if (HasFailure()) { + output->DumpBuffer(); + } + + ASSERT_TRUE(output->CheckPadding()); } TEST_P(IDCTTest, TestWithData) { - int i; + // Test a single known input. 
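// Illustrative sketch (not part of the patch): the Buffer<> helper from
// test/buffer.h used in the rewritten IDCT tests above wraps a width x height
// block in a guard border, so out-of-bounds writes are caught by
// CheckPadding() instead of relying on the hand-initialised 256-byte arrays
// of the old code. Typical use, with some_idct standing in for whichever
// function is under test and pred/coeffs assumed to be set up the same way:
//
//   libvpx_test::Buffer<uint8_t> dst(4, 4, 3);  // 4x4 block, 3-pixel border
//   ASSERT_TRUE(dst.Init());                    // allocate and place guards
//   dst.Set(0);                                 // clear the visible 4x4 area
//   some_idct(coeffs, pred.TopLeftPixel(), pred.stride(),
//             dst.TopLeftPixel(), dst.stride());
//   EXPECT_TRUE(dst.CheckValues(0));            // visible area as expected
//   EXPECT_TRUE(dst.CheckPadding());            // guard border untouched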
+ predict->Set(0); - for (i = 0; i < 16; i++) input[i] = i; - - ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); - - for (i = 0; i < 256; i++) { - if ((i & 0xF) > 3 || i > 63) - EXPECT_EQ(255, output[i]) << "i==" << i; - else if (i == 0) - EXPECT_EQ(11, output[i]) << "i==" << i; - else if (i == 34) - EXPECT_EQ(1, output[i]) << "i==" << i; - else if (i == 2 || i == 17 || i == 32) - EXPECT_EQ(3, output[i]) << "i==" << i; - else - EXPECT_EQ(0, output[i]) << "i==" << i; + int16_t *in = input->TopLeftPixel(); + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 4; ++x) { + in[y * input->stride() + x] = y * 4 + x; + } } + + ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(), + predict->stride(), output->TopLeftPixel(), + output->stride())); + + uint8_t *out = output->TopLeftPixel(); + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 4; ++x) { + switch (y * 4 + x) { + case 0: EXPECT_EQ(11, out[y * output->stride() + x]); break; + case 2: + case 5: + case 8: EXPECT_EQ(3, out[y * output->stride() + x]); break; + case 10: EXPECT_EQ(1, out[y * output->stride() + x]); break; + default: EXPECT_EQ(0, out[y * output->stride() + x]); + } + } + } + + if (HasFailure()) { + output->DumpBuffer(); + } + + ASSERT_TRUE(output->CheckPadding()); } INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c)); + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_neon)); +#endif // HAVE_NEON + #if HAVE_MMX INSTANTIATE_TEST_CASE_P(MMX, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_mmx)); -#endif +#endif // HAVE_MMX + #if HAVE_MSA INSTANTIATE_TEST_CASE_P(MSA, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_msa)); -#endif +#endif // HAVE_MSA + +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P(MMI, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_mmi)); +#endif // HAVE_MMI } diff --git a/libs/libvpx/test/invalid_file_test.cc b/libs/libvpx/test/invalid_file_test.cc index bebbb141d0..79220b0f69 100644 --- a/libs/libvpx/test/invalid_file_test.cc +++ b/libs/libvpx/test/invalid_file_test.cc @@ -45,8 +45,8 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, void OpenResFile(const std::string &res_file_name_) { res_file_ = libvpx_test::OpenTestDataFile(res_file_name_); - ASSERT_TRUE(res_file_ != NULL) << "Result file open failed. Filename: " - << res_file_name_; + ASSERT_TRUE(res_file_ != NULL) + << "Result file open failed. Filename: " << res_file_name_; } virtual bool HandleDecodeResult( @@ -120,10 +120,23 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, TEST_P(InvalidFileTest, ReturnCode) { RunTest(); } +#if CONFIG_VP8_DECODER +const DecodeParam kVP8InvalidFileTests[] = { + { 1, "invalid-bug-1443.ivf" }, +}; + +VP8_INSTANTIATE_TEST_CASE(InvalidFileTest, + ::testing::ValuesIn(kVP8InvalidFileTests)); +#endif // CONFIG_VP8_DECODER + +#if CONFIG_VP9_DECODER const DecodeParam kVP9InvalidFileTests[] = { { 1, "invalid-vp90-02-v2.webm" }, #if CONFIG_VP9_HIGHBITDEPTH { 1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf" }, + { 1, + "invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-." 
+ "ivf" }, #endif { 1, "invalid-vp90-03-v3.webm" }, { 1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf" }, @@ -141,10 +154,14 @@ const DecodeParam kVP9InvalidFileTests[] = { { 1, "invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf" }, { 1, "invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf" }, { 1, "invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf" }, + { 1, + "invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf" }, + { 1, "invalid-crbug-667044.webm" }, }; VP9_INSTANTIATE_TEST_CASE(InvalidFileTest, ::testing::ValuesIn(kVP9InvalidFileTests)); +#endif // CONFIG_VP9_DECODER // This class will include test vectors that are expected to fail // peek. However they are still expected to have no fatal failures. @@ -159,12 +176,12 @@ class InvalidFileInvalidPeekTest : public InvalidFileTest { TEST_P(InvalidFileInvalidPeekTest, ReturnCode) { RunTest(); } #if CONFIG_VP8_DECODER -const DecodeParam kVP8InvalidFileTests[] = { +const DecodeParam kVP8InvalidPeekTests[] = { { 1, "invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf" }, }; VP8_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest, - ::testing::ValuesIn(kVP8InvalidFileTests)); + ::testing::ValuesIn(kVP8InvalidPeekTests)); #endif // CONFIG_VP8_DECODER #if CONFIG_VP9_DECODER @@ -184,6 +201,7 @@ const DecodeParam kMultiThreadedVP9InvalidFileTests[] = { "invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf" }, { 2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf" }, { 4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf" }, + { 2, "invalid-crbug-629481.webm" }, }; INSTANTIATE_TEST_CASE_P( diff --git a/libs/libvpx/test/ivf_video_source.h b/libs/libvpx/test/ivf_video_source.h index b87624a11f..5862d2649f 100644 --- a/libs/libvpx/test/ivf_video_source.h +++ b/libs/libvpx/test/ivf_video_source.h @@ -47,8 +47,8 @@ class IVFVideoSource : public CompressedVideoSource { virtual void Begin() { input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_TRUE(input_file_ != NULL) + << "Input file open failed. 
Filename: " << file_name_; // Read file header uint8_t file_hdr[kIvfFileHdrSize]; diff --git a/libs/libvpx/test/keyframe_test.cc b/libs/libvpx/test/keyframe_test.cc index 38bd923b7d..ee75f401ca 100644 --- a/libs/libvpx/test/keyframe_test.cc +++ b/libs/libvpx/test/keyframe_test.cc @@ -135,8 +135,8 @@ TEST_P(KeyframeTest, TestAutoKeyframe) { for (std::vector::const_iterator iter = kf_pts_list_.begin(); iter != kf_pts_list_.end(); ++iter) { if (deadline_ == VPX_DL_REALTIME && *iter > 0) - EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame " - << *iter; + EXPECT_EQ(0, (*iter - 1) % 30) + << "Unexpected keyframe at frame " << *iter; else EXPECT_EQ(0, *iter % 30) << "Unexpected keyframe at frame " << *iter; } diff --git a/libs/libvpx/test/level_test.cc b/libs/libvpx/test/level_test.cc index fbbb539df1..26935a81b0 100644 --- a/libs/libvpx/test/level_test.cc +++ b/libs/libvpx/test/level_test.cc @@ -66,6 +66,36 @@ class LevelTest int level_; }; +TEST_P(LevelTest, TestTargetLevel11Large) { + ASSERT_NE(encoding_mode_, ::libvpx_test::kRealTime); + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, + 60); + target_level_ = 11; + cfg_.rc_target_bitrate = 150; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(target_level_, level_); +} + +TEST_P(LevelTest, TestTargetLevel20Large) { + ASSERT_NE(encoding_mode_, ::libvpx_test::kRealTime); + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 60); + target_level_ = 20; + cfg_.rc_target_bitrate = 1200; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(target_level_, level_); +} + +TEST_P(LevelTest, TestTargetLevel31Large) { + ASSERT_NE(encoding_mode_, ::libvpx_test::kRealTime); + ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720, 30, + 1, 0, 60); + target_level_ = 31; + cfg_.rc_target_bitrate = 8000; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(target_level_, level_); +} + // Test for keeping level stats only TEST_P(LevelTest, TestTargetLevel0) { ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, @@ -73,11 +103,11 @@ TEST_P(LevelTest, TestTargetLevel0) { target_level_ = 0; min_gf_internal_ = 4; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(11, level_); + ASSERT_GE(11, level_); cfg_.rc_target_bitrate = 1600; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(20, level_); + ASSERT_GE(20, level_); } // Test for level control being turned off @@ -94,12 +124,13 @@ TEST_P(LevelTest, TestTargetLevelApi) { vpx_codec_ctx_t enc; vpx_codec_enc_cfg_t cfg; EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(codec, &cfg, 0)); + cfg.rc_target_bitrate = 100; EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, codec, &cfg, 0)); for (int level = 0; level <= 256; ++level) { if (level == 10 || level == 11 || level == 20 || level == 21 || level == 30 || level == 31 || level == 40 || level == 41 || level == 50 || level == 51 || level == 52 || level == 60 || - level == 61 || level == 62 || level == 0 || level == 255) + level == 61 || level == 62 || level == 0 || level == 1 || level == 255) EXPECT_EQ(VPX_CODEC_OK, vpx_codec_control(&enc, VP9E_SET_TARGET_LEVEL, level)); else diff --git a/libs/libvpx/test/lpf_test.cc b/libs/libvpx/test/lpf_test.cc index 552b5e33c2..e04b996cd8 100644 --- a/libs/libvpx/test/lpf_test.cc +++ b/libs/libvpx/test/lpf_test.cc @@ -114,6 +114,18 @@ void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit, } } +uint8_t GetOuterThresh(ACMRandom *rnd) { + return static_cast(rnd->RandRange(3 * 
MAX_LOOP_FILTER + 5)); +} + +uint8_t GetInnerThresh(ACMRandom *rnd) { + return static_cast(rnd->RandRange(MAX_LOOP_FILTER + 1)); +} + +uint8_t GetHevThresh(ACMRandom *rnd) { + return static_cast(rnd->RandRange(MAX_LOOP_FILTER + 1) >> 4); +} + class Loop8Test6Param : public ::testing::TestWithParam { public: virtual ~Loop8Test6Param() {} @@ -162,15 +174,15 @@ TEST_P(Loop8Test6Param, OperationCheck) { int first_failure = -1; for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; @@ -221,15 +233,15 @@ TEST_P(Loop8Test6Param, ValueCheck) { for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; @@ -271,27 +283,27 @@ TEST_P(Loop8Test9Param, OperationCheck) { int first_failure = -1; for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(3 * MAX_LOOP_FILTER + 4)); + tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; @@ -334,27 +346,27 @@ TEST_P(Loop8Test9Param, ValueCheck) { int first_failure = -1; for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, 
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(3 * MAX_LOOP_FILTER + 4)); + tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; @@ -402,10 +414,10 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_lpf_vertical_4_c, 8), make_tuple(&vpx_highbd_lpf_horizontal_8_sse2, &vpx_highbd_lpf_horizontal_8_c, 8), - make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2, - &vpx_highbd_lpf_horizontal_edge_8_c, 8), - make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2, - &vpx_highbd_lpf_horizontal_edge_16_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, + &vpx_highbd_lpf_horizontal_16_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_16_dual_sse2, + &vpx_highbd_lpf_horizontal_16_dual_c, 8), make_tuple(&vpx_highbd_lpf_vertical_8_sse2, &vpx_highbd_lpf_vertical_8_c, 8), make_tuple(&vpx_highbd_lpf_vertical_16_sse2, @@ -416,10 +428,10 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_lpf_vertical_4_c, 10), make_tuple(&vpx_highbd_lpf_horizontal_8_sse2, &vpx_highbd_lpf_horizontal_8_c, 10), - make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2, - &vpx_highbd_lpf_horizontal_edge_8_c, 10), - make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2, - &vpx_highbd_lpf_horizontal_edge_16_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, + &vpx_highbd_lpf_horizontal_16_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_16_dual_sse2, + &vpx_highbd_lpf_horizontal_16_dual_c, 10), make_tuple(&vpx_highbd_lpf_vertical_8_sse2, &vpx_highbd_lpf_vertical_8_c, 10), make_tuple(&vpx_highbd_lpf_vertical_16_sse2, @@ -430,10 +442,10 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_lpf_vertical_4_c, 12), make_tuple(&vpx_highbd_lpf_horizontal_8_sse2, &vpx_highbd_lpf_horizontal_8_c, 12), - make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2, - &vpx_highbd_lpf_horizontal_edge_8_c, 12), - make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2, - &vpx_highbd_lpf_horizontal_edge_16_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_16_sse2, + &vpx_highbd_lpf_horizontal_16_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_16_dual_sse2, + &vpx_highbd_lpf_horizontal_16_dual_c, 12), make_tuple(&vpx_highbd_lpf_vertical_8_sse2, &vpx_highbd_lpf_vertical_8_c, 12), make_tuple(&vpx_highbd_lpf_vertical_16_sse2, @@ -450,10 +462,9 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( make_tuple(&vpx_lpf_horizontal_4_sse2, &vpx_lpf_horizontal_4_c, 8), make_tuple(&vpx_lpf_horizontal_8_sse2, &vpx_lpf_horizontal_8_c, 8), - make_tuple(&vpx_lpf_horizontal_edge_8_sse2, - &vpx_lpf_horizontal_edge_8_c, 8), - make_tuple(&vpx_lpf_horizontal_edge_16_sse2, - &vpx_lpf_horizontal_edge_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_sse2, + 
&vpx_lpf_horizontal_16_dual_c, 8), make_tuple(&vpx_lpf_vertical_4_sse2, &vpx_lpf_vertical_4_c, 8), make_tuple(&vpx_lpf_vertical_8_sse2, &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_vertical_16_sse2, &vpx_lpf_vertical_16_c, 8), @@ -465,10 +476,10 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_AVX2 && (!CONFIG_VP9_HIGHBITDEPTH) INSTANTIATE_TEST_CASE_P( AVX2, Loop8Test6Param, - ::testing::Values(make_tuple(&vpx_lpf_horizontal_edge_8_avx2, - &vpx_lpf_horizontal_edge_8_c, 8), - make_tuple(&vpx_lpf_horizontal_edge_16_avx2, - &vpx_lpf_horizontal_edge_16_c, 8))); + ::testing::Values(make_tuple(&vpx_lpf_horizontal_16_avx2, + &vpx_lpf_horizontal_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_avx2, + &vpx_lpf_horizontal_16_dual_c, 8))); #endif #if HAVE_SSE2 @@ -515,15 +526,89 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_NEON #if CONFIG_VP9_HIGHBITDEPTH -// No neon high bitdepth functions. +INSTANTIATE_TEST_CASE_P( + NEON, Loop8Test6Param, + ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_neon, + &vpx_highbd_lpf_horizontal_4_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_4_neon, + &vpx_highbd_lpf_horizontal_4_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_4_neon, + &vpx_highbd_lpf_horizontal_4_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_8_neon, + &vpx_highbd_lpf_horizontal_8_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_8_neon, + &vpx_highbd_lpf_horizontal_8_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_8_neon, + &vpx_highbd_lpf_horizontal_8_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_16_neon, + &vpx_highbd_lpf_horizontal_16_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_16_neon, + &vpx_highbd_lpf_horizontal_16_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_16_neon, + &vpx_highbd_lpf_horizontal_16_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_16_dual_neon, + &vpx_highbd_lpf_horizontal_16_dual_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_16_dual_neon, + &vpx_highbd_lpf_horizontal_16_dual_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_16_dual_neon, + &vpx_highbd_lpf_horizontal_16_dual_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_4_neon, + &vpx_highbd_lpf_vertical_4_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_4_neon, + &vpx_highbd_lpf_vertical_4_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_4_neon, + &vpx_highbd_lpf_vertical_4_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_8_neon, + &vpx_highbd_lpf_vertical_8_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_8_neon, + &vpx_highbd_lpf_vertical_8_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_8_neon, + &vpx_highbd_lpf_vertical_8_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_16_neon, + &vpx_highbd_lpf_vertical_16_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_16_neon, + &vpx_highbd_lpf_vertical_16_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_16_neon, + &vpx_highbd_lpf_vertical_16_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_16_dual_neon, + &vpx_highbd_lpf_vertical_16_dual_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_16_dual_neon, + &vpx_highbd_lpf_vertical_16_dual_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_16_dual_neon, + &vpx_highbd_lpf_vertical_16_dual_c, 12))); +INSTANTIATE_TEST_CASE_P( + NEON, Loop8Test9Param, + ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_dual_neon, + &vpx_highbd_lpf_horizontal_4_dual_c, 8), + make_tuple(&vpx_highbd_lpf_horizontal_4_dual_neon, + &vpx_highbd_lpf_horizontal_4_dual_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_4_dual_neon, + &vpx_highbd_lpf_horizontal_4_dual_c, 12), + make_tuple(&vpx_highbd_lpf_horizontal_8_dual_neon, + &vpx_highbd_lpf_horizontal_8_dual_c, 8), + 
make_tuple(&vpx_highbd_lpf_horizontal_8_dual_neon, + &vpx_highbd_lpf_horizontal_8_dual_c, 10), + make_tuple(&vpx_highbd_lpf_horizontal_8_dual_neon, + &vpx_highbd_lpf_horizontal_8_dual_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_4_dual_neon, + &vpx_highbd_lpf_vertical_4_dual_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_4_dual_neon, + &vpx_highbd_lpf_vertical_4_dual_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_4_dual_neon, + &vpx_highbd_lpf_vertical_4_dual_c, 12), + make_tuple(&vpx_highbd_lpf_vertical_8_dual_neon, + &vpx_highbd_lpf_vertical_8_dual_c, 8), + make_tuple(&vpx_highbd_lpf_vertical_8_dual_neon, + &vpx_highbd_lpf_vertical_8_dual_c, 10), + make_tuple(&vpx_highbd_lpf_vertical_8_dual_neon, + &vpx_highbd_lpf_vertical_8_dual_c, 12))); #else INSTANTIATE_TEST_CASE_P( NEON, Loop8Test6Param, ::testing::Values( - make_tuple(&vpx_lpf_horizontal_edge_8_neon, - &vpx_lpf_horizontal_edge_8_c, 8), - make_tuple(&vpx_lpf_horizontal_edge_16_neon, - &vpx_lpf_horizontal_edge_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_neon, &vpx_lpf_horizontal_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_neon, + &vpx_lpf_horizontal_16_dual_c, 8), make_tuple(&vpx_lpf_vertical_16_neon, &vpx_lpf_vertical_16_c, 8), make_tuple(&vpx_lpf_vertical_16_dual_neon, &vpx_lpf_vertical_16_dual_c, 8), @@ -550,8 +635,9 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( make_tuple(&vpx_lpf_horizontal_4_dspr2, &vpx_lpf_horizontal_4_c, 8), make_tuple(&vpx_lpf_horizontal_8_dspr2, &vpx_lpf_horizontal_8_c, 8), - make_tuple(&vpx_lpf_horizontal_edge_8, &vpx_lpf_horizontal_edge_8, 8), - make_tuple(&vpx_lpf_horizontal_edge_16, &vpx_lpf_horizontal_edge_16, 8), + make_tuple(&vpx_lpf_horizontal_16_dspr2, &vpx_lpf_horizontal_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_dspr2, + &vpx_lpf_horizontal_16_dual_c, 8), make_tuple(&vpx_lpf_vertical_4_dspr2, &vpx_lpf_vertical_4_c, 8), make_tuple(&vpx_lpf_vertical_8_dspr2, &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_vertical_16_dspr2, &vpx_lpf_vertical_16_c, 8), @@ -576,10 +662,9 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( make_tuple(&vpx_lpf_horizontal_4_msa, &vpx_lpf_horizontal_4_c, 8), make_tuple(&vpx_lpf_horizontal_8_msa, &vpx_lpf_horizontal_8_c, 8), - make_tuple(&vpx_lpf_horizontal_edge_8_msa, &vpx_lpf_horizontal_edge_8_c, - 8), - make_tuple(&vpx_lpf_horizontal_edge_16_msa, - &vpx_lpf_horizontal_edge_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_msa, + &vpx_lpf_horizontal_16_dual_c, 8), make_tuple(&vpx_lpf_vertical_4_msa, &vpx_lpf_vertical_4_c, 8), make_tuple(&vpx_lpf_vertical_8_msa, &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_vertical_16_msa, &vpx_lpf_vertical_16_c, 8))); diff --git a/libs/libvpx/test/minmax_test.cc b/libs/libvpx/test/minmax_test.cc index e51c9fd48c..9c119116a8 100644 --- a/libs/libvpx/test/minmax_test.cc +++ b/libs/libvpx/test/minmax_test.cc @@ -107,10 +107,10 @@ TEST_P(MinMaxTest, CompareReferenceAndVaryStride) { int min_ref, max_ref, min, max; reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref); ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max)); - EXPECT_EQ(max_ref, max) << "when a_stride = " << a_stride - << " and b_stride = " << b_stride; - EXPECT_EQ(min_ref, min) << "when a_stride = " << a_stride - << " and b_stride = " << b_stride; + EXPECT_EQ(max_ref, max) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; + EXPECT_EQ(min_ref, min) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; } } } @@ -127,4 +127,9 @@ 
INSTANTIATE_TEST_CASE_P(NEON, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_neon)); #endif +#if HAVE_MSA +INSTANTIATE_TEST_CASE_P(MSA, MinMaxTest, + ::testing::Values(&vpx_minmax_8x8_msa)); +#endif + } // namespace diff --git a/libs/libvpx/test/partial_idct_test.cc b/libs/libvpx/test/partial_idct_test.cc index de6ff4a01b..f7b50f53a1 100644 --- a/libs/libvpx/test/partial_idct_test.cc +++ b/libs/libvpx/test/partial_idct_test.cc @@ -12,6 +12,8 @@ #include #include +#include + #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vp9_rtcd.h" @@ -23,235 +25,935 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_scan.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; namespace { + typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride); typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride); -typedef std::tr1::tuple +typedef void (*InvTxfmWithBdFunc)(const tran_low_t *in, uint8_t *out, + int stride, int bd); + +template +void wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) { + (void)bd; + fn(in, out, stride); +} + +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*InvTxfmHighbdFunc)(const tran_low_t *in, uint16_t *out, + int stride, int bd); +template +void highbd_wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) { + fn(in, CAST_TO_SHORTPTR(out), stride, bd); +} +#endif + +typedef std::tr1::tuple PartialInvTxfmParam; const int kMaxNumCoeffs = 1024; +const int kCountTestBlock = 1000; + class PartialIDctTest : public ::testing::TestWithParam { public: virtual ~PartialIDctTest() {} virtual void SetUp() { - ftxfm_ = GET_PARAM(0); - full_itxfm_ = GET_PARAM(1); - partial_itxfm_ = GET_PARAM(2); + rnd_.Reset(ACMRandom::DeterministicSeed()); + fwd_txfm_ = GET_PARAM(0); + full_inv_txfm_ = GET_PARAM(1); + partial_inv_txfm_ = GET_PARAM(2); tx_size_ = GET_PARAM(3); last_nonzero_ = GET_PARAM(4); + bit_depth_ = GET_PARAM(5); + pixel_size_ = GET_PARAM(6); + mask_ = (1 << bit_depth_) - 1; + + switch (tx_size_) { + case TX_4X4: size_ = 4; break; + case TX_8X8: size_ = 8; break; + case TX_16X16: size_ = 16; break; + case TX_32X32: size_ = 32; break; + default: FAIL() << "Wrong Size!"; break; + } + + // Randomize stride_ to a value less than or equal to 1024 + stride_ = rnd_(1024) + 1; + if (stride_ < size_) { + stride_ = size_; + } + // Align stride_ to 16 if it's bigger than 16. 
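// Illustrative worked example (editorial sketch, assuming size_ == 16):
// rnd_(1024) + 1 draws a stride in [1, 1024]; a draw of 1000 gives
// stride_ = 1001, which the masking below (stride_ &= ~15) rounds down to
// 992, while a draw of 7 is first clamped up to size_. The maximum draw,
// 1024, is already a multiple of 16.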
+ if (stride_ > 16) { + stride_ &= ~15; + } + + input_block_size_ = size_ * size_; + output_block_size_ = size_ * stride_; + + input_block_ = reinterpret_cast( + vpx_memalign(16, sizeof(*input_block_) * input_block_size_)); + output_block_ = reinterpret_cast( + vpx_memalign(16, pixel_size_ * output_block_size_)); + output_block_ref_ = reinterpret_cast( + vpx_memalign(16, pixel_size_ * output_block_size_)); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + virtual void TearDown() { + vpx_free(input_block_); + input_block_ = NULL; + vpx_free(output_block_); + output_block_ = NULL; + vpx_free(output_block_ref_); + output_block_ref_ = NULL; + libvpx_test::ClearSystemState(); + } + + void InitMem() { + memset(input_block_, 0, sizeof(*input_block_) * input_block_size_); + if (pixel_size_ == 1) { + for (int j = 0; j < output_block_size_; ++j) { + output_block_[j] = output_block_ref_[j] = rnd_.Rand16() & mask_; + } + } else { + ASSERT_EQ(2, pixel_size_); + uint16_t *const output = reinterpret_cast(output_block_); + uint16_t *const output_ref = + reinterpret_cast(output_block_ref_); + for (int j = 0; j < output_block_size_; ++j) { + output[j] = output_ref[j] = rnd_.Rand16() & mask_; + } + } + } + + void InitInput() { + const int64_t max_coeff = (32766 << (bit_depth_ - 8)) / 4; + int64_t max_energy_leftover = max_coeff * max_coeff; + for (int j = 0; j < last_nonzero_; ++j) { + tran_low_t coeff = static_cast( + sqrt(1.0 * max_energy_leftover) * (rnd_.Rand16() - 32768) / 65536); + max_energy_leftover -= static_cast(coeff) * coeff; + if (max_energy_leftover < 0) { + max_energy_leftover = 0; + coeff = 0; + } + input_block_[vp9_default_scan_orders[tx_size_].scan[j]] = coeff; + } + } + + void PrintDiff() { + if (memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) { + uint16_t ref, opt; + for (int y = 0; y < size_; y++) { + for (int x = 0; x < size_; x++) { + if (pixel_size_ == 1) { + ref = output_block_ref_[y * stride_ + x]; + opt = output_block_[y * stride_ + x]; + } else { + ref = reinterpret_cast( + output_block_ref_)[y * stride_ + x]; + opt = reinterpret_cast(output_block_)[y * stride_ + x]; + } + if (ref != opt) { + printf("dest[%d][%d] diff:%6d (ref),%6d (opt)\n", y, x, ref, opt); + } + } + } + + printf("\ninput_block_:\n"); + for (int y = 0; y < size_; y++) { + for (int x = 0; x < size_; x++) { + printf("%6d,", input_block_[y * size_ + x]); + } + printf("\n"); + } + } + } protected: int last_nonzero_; TX_SIZE tx_size_; - FwdTxfmFunc ftxfm_; - InvTxfmFunc full_itxfm_; - InvTxfmFunc partial_itxfm_; + tran_low_t *input_block_; + uint8_t *output_block_; + uint8_t *output_block_ref_; + int size_; + int stride_; + int pixel_size_; + int input_block_size_; + int output_block_size_; + int bit_depth_; + int mask_; + FwdTxfmFunc fwd_txfm_; + InvTxfmWithBdFunc full_inv_txfm_; + InvTxfmWithBdFunc partial_inv_txfm_; + ACMRandom rnd_; }; TEST_P(PartialIDctTest, RunQuantCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - int size; - switch (tx_size_) { - case TX_4X4: size = 4; break; - case TX_8X8: size = 8; break; - case TX_16X16: size = 16; break; - case TX_32X32: size = 32; break; - default: FAIL() << "Wrong Size!"; break; - } - DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]); - - const int count_test_block = 1000; - const int block_size = size * size; - + const int 
count_test_block = (size_ != 4) ? kCountTestBlock : 65536; DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]); DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]); - int max_error = 0; + InitMem(); + for (int i = 0; i < count_test_block; ++i) { - // clear out destination buffer - memset(dst1, 0, sizeof(*dst1) * block_size); - memset(dst2, 0, sizeof(*dst2) * block_size); - memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size); - memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size); - - ACMRandom rnd(ACMRandom::DeterministicSeed()); - - for (int i = 0; i < count_test_block; ++i) { - // Initialize a test block with input range [-255, 255]. + // Initialize a test block with input range [-mask_, mask_]. + if (size_ != 4) { if (i == 0) { - for (int j = 0; j < block_size; ++j) input_extreme_block[j] = 255; + for (int k = 0; k < input_block_size_; ++k) { + input_extreme_block[k] = mask_; + } } else if (i == 1) { - for (int j = 0; j < block_size; ++j) input_extreme_block[j] = -255; + for (int k = 0; k < input_block_size_; ++k) { + input_extreme_block[k] = -mask_; + } } else { - for (int j = 0; j < block_size; ++j) { - input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255; + for (int k = 0; k < input_block_size_; ++k) { + input_extreme_block[k] = rnd_.Rand8() % 2 ? mask_ : -mask_; } } - - ftxfm_(input_extreme_block, output_ref_block, size); - - // quantization with maximum allowed step sizes - test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336; - for (int j = 1; j < last_nonzero_; ++j) { - test_coef_block1[vp9_default_scan_orders[tx_size_].scan[j]] = - (output_ref_block[j] / 1828) * 1828; + } else { + // Try all possible combinations. + for (int k = 0; k < input_block_size_; ++k) { + input_extreme_block[k] = (i & (1 << k)) ? 
mask_ : -mask_; } } - ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size)); - ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size)); + fwd_txfm_(input_extreme_block, output_ref_block, size_); - for (int j = 0; j < block_size; ++j) { - const int diff = dst1[j] - dst2[j]; - const int error = diff * diff; - if (max_error < error) max_error = error; + // quantization with minimum allowed step sizes + input_block_[0] = (output_ref_block[0] / 4) * 4; + for (int k = 1; k < last_nonzero_; ++k) { + const int pos = vp9_default_scan_orders[tx_size_].scan[k]; + input_block_[pos] = (output_ref_block[pos] / 4) * 4; } - } - EXPECT_EQ(0, max_error) - << "Error: partial inverse transform produces different results"; + ASM_REGISTER_STATE_CHECK( + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + ASM_REGISTER_STATE_CHECK( + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); + ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) + << "Error: partial inverse transform produces different results"; + } } TEST_P(PartialIDctTest, ResultsMatch) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - int size; - switch (tx_size_) { - case TX_4X4: size = 4; break; - case TX_8X8: size = 8; break; - case TX_16X16: size = 16; break; - case TX_32X32: size = 32; break; - default: FAIL() << "Wrong Size!"; break; + for (int i = 0; i < kCountTestBlock; ++i) { + InitMem(); + InitInput(); + + ASM_REGISTER_STATE_CHECK( + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + ASM_REGISTER_STATE_CHECK( + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); + ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) + << "Error: partial inverse transform produces different results"; } - DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]); - const int count_test_block = 1000; - const int max_coeff = 32766 / 4; - const int block_size = size * size; - int max_error = 0; - for (int i = 0; i < count_test_block; ++i) { - // clear out destination buffer - memset(dst1, 0, sizeof(*dst1) * block_size); - memset(dst2, 0, sizeof(*dst2) * block_size); - memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size); - memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size); - int max_energy_leftover = max_coeff * max_coeff; +} + +TEST_P(PartialIDctTest, AddOutputBlock) { + for (int i = 0; i < kCountTestBlock; ++i) { + InitMem(); for (int j = 0; j < last_nonzero_; ++j) { - int16_t coef = static_cast(sqrt(1.0 * max_energy_leftover) * - (rnd.Rand16() - 32768) / 65536); - max_energy_leftover -= coef * coef; - if (max_energy_leftover < 0) { - max_energy_leftover = 0; - coef = 0; - } - test_coef_block1[vp9_default_scan_orders[tx_size_].scan[j]] = coef; + input_block_[vp9_default_scan_orders[tx_size_].scan[j]] = 10; } - memcpy(test_coef_block2, test_coef_block1, - sizeof(*test_coef_block2) * block_size); + ASM_REGISTER_STATE_CHECK( + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + ASM_REGISTER_STATE_CHECK( + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); + ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) + << "Error: Transform results are not correctly added to output."; + } +} - 
ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size)); - ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block2, dst2, size)); +TEST_P(PartialIDctTest, SingleExtremeCoeff) { + const int16_t max_coeff = std::numeric_limits::max(); + const int16_t min_coeff = std::numeric_limits::min(); + for (int i = 0; i < last_nonzero_; ++i) { + memset(input_block_, 0, sizeof(*input_block_) * input_block_size_); + // Run once for min and once for max. + for (int j = 0; j < 2; ++j) { + const int coeff = j ? min_coeff : max_coeff; - for (int j = 0; j < block_size; ++j) { - const int diff = dst1[j] - dst2[j]; - const int error = diff * diff; - if (max_error < error) max_error = error; + memset(output_block_, 0, pixel_size_ * output_block_size_); + memset(output_block_ref_, 0, pixel_size_ * output_block_size_); + input_block_[vp9_default_scan_orders[tx_size_].scan[i]] = coeff; + + ASM_REGISTER_STATE_CHECK( + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + ASM_REGISTER_STATE_CHECK( + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); + ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) + << "Error: Fails with single coeff of " << coeff << " at " << i + << "."; } } +} - EXPECT_EQ(0, max_error) +TEST_P(PartialIDctTest, DISABLED_Speed) { + // Keep runtime stable with transform size. + const int kCountSpeedTestBlock = 500000000 / input_block_size_; + InitMem(); + InitInput(); + + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + ASM_REGISTER_STATE_CHECK( + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + } + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_); + } + libvpx_test::ClearSystemState(); + vpx_usec_timer_mark(&timer); + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + printf("idct%dx%d_%d (%s %d) time: %5d ms\n", size_, size_, last_nonzero_, + (pixel_size_ == 1) ? 
"bitdepth" : "high bitdepth", bit_depth_, + elapsed_time); + ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) << "Error: partial inverse transform produces different results"; } + using std::tr1::make_tuple; -INSTANTIATE_TEST_CASE_P( - C, PartialIDctTest, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, - &vpx_idct32x32_34_add_c, TX_32X32, 34), - make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, - &vpx_idct32x32_1_add_c, TX_32X32, 1), - make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, - &vpx_idct16x16_10_add_c, TX_16X16, 10), - make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, - &vpx_idct16x16_1_add_c, TX_16X16, 1), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, - &vpx_idct8x8_12_add_c, TX_8X8, 12), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, - &vpx_idct8x8_1_add_c, TX_8X8, 1), - make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, - &vpx_idct4x4_1_add_c, TX_4X4, 1))); +const PartialInvTxfmParam c_partial_idct_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1024, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1024, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1024, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 12, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 12, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 8, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 10, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 8, 2), + 
make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 10, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 8, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 10, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 8, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 10, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 8, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 10, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 8, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 10, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 12, 2), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 135, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 256, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 38, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 10, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 1, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 64, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 12, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 1, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 16, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 1, 8, 1) +}; -#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - NEON, PartialIDctTest, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, - &vpx_idct32x32_1_add_neon, TX_32X32, 1), - make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, - &vpx_idct16x16_10_add_neon, TX_16X16, 10), - make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, - &vpx_idct16x16_1_add_neon, TX_16X16, 1), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, - &vpx_idct8x8_12_add_neon, TX_8X8, 12), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, - &vpx_idct8x8_1_add_neon, TX_8X8, 1), - make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, - &vpx_idct4x4_1_add_neon, TX_4X4, 1))); -#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P(C, PartialIDctTest, + ::testing::ValuesIn(c_partial_idct_tests)); -#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - SSE2, PartialIDctTest, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, - &vpx_idct32x32_34_add_sse2, TX_32X32, 34), - make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, - &vpx_idct32x32_1_add_sse2, TX_32X32, 1), - make_tuple(&vpx_fdct16x16_c, 
&vpx_idct16x16_256_add_c, - &vpx_idct16x16_10_add_sse2, TX_16X16, 10), - make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, - &vpx_idct16x16_1_add_sse2, TX_16X16, 1), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, - &vpx_idct8x8_12_add_sse2, TX_8X8, 12), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, - &vpx_idct8x8_1_add_sse2, TX_8X8, 1), - make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, - &vpx_idct4x4_1_add_sse2, TX_4X4, 1))); -#endif +#if !CONFIG_EMULATE_HARDWARE -#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \ - !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - SSSE3_64, PartialIDctTest, - ::testing::Values(make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, - &vpx_idct8x8_12_add_ssse3, TX_8X8, 12))); -#endif +#if HAVE_NEON +const PartialInvTxfmParam neon_partial_idct_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + 
&highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 8, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 10, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 8, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 10, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 8, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 10, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 12, 2), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 135, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 256, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 38, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 10, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 1, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 64, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 12, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 1, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 16, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 1, 8, 1) +}; -#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - MSA, PartialIDctTest, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, - &vpx_idct32x32_34_add_msa, TX_32X32, 34), - make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, - &vpx_idct32x32_1_add_msa, TX_32X32, 1), - make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, - &vpx_idct16x16_10_add_msa, TX_16X16, 10), - make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, - &vpx_idct16x16_1_add_msa, TX_16X16, 1), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, - &vpx_idct8x8_12_add_msa, TX_8X8, 10), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, - &vpx_idct8x8_1_add_msa, TX_8X8, 1), - make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, - &vpx_idct4x4_1_add_msa, TX_4X4, 1))); -#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P(NEON, PartialIDctTest, + ::testing::ValuesIn(neon_partial_idct_tests)); +#endif // HAVE_NEON + +#if HAVE_SSE2 +// 32x32_135_ is implemented using the 1024 version. 
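Each entry in these parameter tables binds a reference forward transform to a full and a partial inverse transform through the wrapper template, followed by the transform size, the number of nonzero coefficients, the bit depth, and the pixel size in bytes. A minimal sketch of one 8-bit entry, in the spirit of the SSE2 table that follows (the particular functions and template argument are illustrative, not an addition to the table):

// Sketch: one 8-bit PartialInvTxfmParam entry. wrapper<fn> adapts the
// (in, out, stride) inverse-transform signature to the harness's
// (in, out, stride, bd) signature and ignores bd.
const PartialInvTxfmParam example = make_tuple(
    &vpx_fdct8x8_c,                     // forward transform used as reference
    &wrapper<vpx_idct8x8_64_add_c>,     // full inverse transform
    &wrapper<vpx_idct8x8_12_add_sse2>,  // partial inverse transform under test
    TX_8X8, 12,                         // transform size, last nonzero coeff
    8, 1);                              // bit depth, bytes per pixel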
+const PartialInvTxfmParam sse2_partial_idct_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, + &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 8, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 10, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, + &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 8, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 10, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + 
&highbd_wrapper, TX_4X4, 16, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 8, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 10, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 12, 2), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 135, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 256, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 38, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 10, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 1, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 64, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 12, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 1, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 16, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 1, 8, 1) +}; + +INSTANTIATE_TEST_CASE_P(SSE2, PartialIDctTest, + ::testing::ValuesIn(sse2_partial_idct_tests)); + +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +const PartialInvTxfmParam ssse3_partial_idct_tests[] = { + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 135, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 12, 8, 1) +}; + +INSTANTIATE_TEST_CASE_P(SSSE3, PartialIDctTest, + ::testing::ValuesIn(ssse3_partial_idct_tests)); +#endif // HAVE_SSSE3 + +#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH +const PartialInvTxfmParam sse4_1_partial_idct_tests[] = { + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 12, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 135, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 135, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 12, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, + 256, 8, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, + 256, 10, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, + 256, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 8, 2), + make_tuple( + 
&vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 12, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 12, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 12, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 8, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 10, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 12, 2) +}; + +INSTANTIATE_TEST_CASE_P(SSE4_1, PartialIDctTest, + ::testing::ValuesIn(sse4_1_partial_idct_tests)); +#endif // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH +const PartialInvTxfmParam dspr2_partial_idct_tests[] = { + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 256, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 10, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 1, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 64, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 12, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 1, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 16, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 1, 8, 1) +}; + +INSTANTIATE_TEST_CASE_P(DSPR2, PartialIDctTest, + ::testing::ValuesIn(dspr2_partial_idct_tests)); +#endif // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH +// 32x32_135_ is implemented using the 1024 version. 
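Entries in the high-bit-depth lists above go through highbd_wrapper instead, which reinterprets the 8-bit destination pointer as 16-bit samples and forwards the bit depth. A hedged sketch of the shape of one such entry (the particular functions and parameters are illustrative only):

// Sketch: a high-bit-depth PartialInvTxfmParam entry. highbd_wrapper<fn>
// calls fn(in, CAST_TO_SHORTPTR(out), stride, bd), matching pixel_size_ == 2.
const PartialInvTxfmParam highbd_example = make_tuple(
    &vpx_highbd_fdct4x4_c,
    &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,  // full inverse
    &highbd_wrapper<vpx_highbd_idct4x4_1_add_c>,   // partial inverse (DC only)
    TX_4X4, 1,                                     // transform size, last nonzero coeff
    10, 2);                                        // 10-bit, 2 bytes per pixel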
+const PartialInvTxfmParam msa_partial_idct_tests[] = { + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 256, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 10, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 1, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 64, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 12, 8, 1), + make_tuple(&vpx_fdct8x8_c, &wrapper, + &wrapper, TX_8X8, 1, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 16, 8, 1), + make_tuple(&vpx_fdct4x4_c, &wrapper, + &wrapper, TX_4X4, 1, 8, 1) +}; + +INSTANTIATE_TEST_CASE_P(MSA, PartialIDctTest, + ::testing::ValuesIn(msa_partial_idct_tests)); +#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH + +#endif // !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libs/libvpx/test/pp_filter_test.cc b/libs/libvpx/test/pp_filter_test.cc index 2e34fed065..5a2ade1ef4 100644 --- a/libs/libvpx/test/pp_filter_test.cc +++ b/libs/libvpx/test/pp_filter_test.cc @@ -7,22 +7,42 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/buffer.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" -typedef void (*PostProcFunc)(unsigned char *src_ptr, unsigned char *dst_ptr, - int src_pixels_per_line, int dst_pixels_per_line, - int cols, unsigned char *flimit, int size); +using libvpx_test::ACMRandom; +using libvpx_test::Buffer; + +typedef void (*VpxPostProcDownAndAcrossMbRowFunc)( + unsigned char *src_ptr, unsigned char *dst_ptr, int src_pixels_per_line, + int dst_pixels_per_line, int cols, unsigned char *flimit, int size); + +typedef void (*VpxMbPostProcAcrossIpFunc)(unsigned char *src, int pitch, + int rows, int cols, int flimit); + +typedef void (*VpxMbPostProcDownFunc)(unsigned char *dst, int pitch, int rows, + int cols, int flimit); namespace { -class VPxPostProcessingFilterTest - : public ::testing::TestWithParam { +// Compute the filter level used in post proc from the loop filter strength +int q2mbl(int x) { + if (x < 20) x = 20; + + x = 50 + (x - 50) * 10 / 8; + return x * x / 3; +} + +class VpxPostProcDownAndAcrossMbRowTest + : public ::testing::TestWithParam { public: virtual void TearDown() { libvpx_test::ClearSystemState(); } }; @@ -30,31 +50,22 @@ class VPxPostProcessingFilterTest // Test routine for the VPx post-processing function // vpx_post_proc_down_and_across_mb_row_c. -TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) { +TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { // Size of the underlying data block that will be filtered. const int block_width = 16; const int block_height = 16; // 5-tap filter needs 2 padding rows above and below the block in the input. 
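The post-proc tests below stage their pixels through the Buffer helper from test/buffer.h rather than raw vpx_calloc allocations; per-side padding overloads are used where asymmetric borders are needed. A minimal usage sketch of the uniform-padding form (the element type parameter is written out explicitly here):

// Sketch (editorial): staging a 16x16 block with an 8-pixel border all around.
Buffer<uint8_t> block = Buffer<uint8_t>(16, 16, 8);  // width, height, padding
ASSERT_TRUE(block.Init());  // allocates the padded, aligned storage
block.SetPadding(10);       // fill only the border pixels
block.Set(1);               // fill the visible 16x16 interior
uint8_t *top_left = block.TopLeftPixel();  // first pixel inside the padding
// Successive rows are block.stride() bytes apart (interior width plus the
// side padding).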
- const int input_width = block_width; - const int input_height = block_height + 4; - const int input_stride = input_width; - const int input_size = input_width * input_height; + Buffer src_image = Buffer(block_width, block_height, 2); + ASSERT_TRUE(src_image.Init()); // Filter extends output block by 8 samples at left and right edges. - const int output_width = block_width + 16; - const int output_height = block_height; - const int output_stride = output_width; - const int output_size = output_width * output_height; + // Though the left padding is only 8 bytes, the assembly code tries to + // read 16 bytes before the pointer. + Buffer dst_image = + Buffer(block_width, block_height, 8, 16, 8, 8); + ASSERT_TRUE(dst_image.Init()); - uint8_t *const src_image = - reinterpret_cast(vpx_calloc(input_size, 1)); - uint8_t *const dst_image = - reinterpret_cast(vpx_calloc(output_size, 1)); - - // Pointers to top-left pixel of block in the input and output images. - uint8_t *const src_image_ptr = src_image + (input_stride << 1); - uint8_t *const dst_image_ptr = dst_image + 8; uint8_t *const flimits = reinterpret_cast(vpx_memalign(16, block_width)); (void)memset(flimits, 255, block_width); @@ -62,53 +73,412 @@ TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) { // Initialize pixels in the input: // block pixels to value 1, // border pixels to value 10. - (void)memset(src_image, 10, input_size); - uint8_t *pixel_ptr = src_image_ptr; - for (int i = 0; i < block_height; ++i) { - for (int j = 0; j < block_width; ++j) { - pixel_ptr[j] = 1; - } - pixel_ptr += input_stride; - } + src_image.SetPadding(10); + src_image.Set(1); // Initialize pixels in the output to 99. - (void)memset(dst_image, 99, output_size); + dst_image.Set(99); - ASM_REGISTER_STATE_CHECK(GetParam()(src_image_ptr, dst_image_ptr, - input_stride, output_stride, block_width, - flimits, 16)); + ASM_REGISTER_STATE_CHECK(GetParam()( + src_image.TopLeftPixel(), dst_image.TopLeftPixel(), src_image.stride(), + dst_image.stride(), block_width, flimits, 16)); - static const uint8_t expected_data[block_height] = { 4, 3, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 3, 4 }; + static const uint8_t kExpectedOutput[block_height] = { + 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4 + }; - pixel_ptr = dst_image_ptr; + uint8_t *pixel_ptr = dst_image.TopLeftPixel(); for (int i = 0; i < block_height; ++i) { for (int j = 0; j < block_width; ++j) { - EXPECT_EQ(expected_data[i], pixel_ptr[j]) - << "VPxPostProcessingFilterTest failed with invalid filter output"; + ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j]) + << "at (" << i << ", " << j << ")"; } - pixel_ptr += output_stride; + pixel_ptr += dst_image.stride(); } - vpx_free(src_image); - vpx_free(dst_image); vpx_free(flimits); }; +TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { + // Size of the underlying data block that will be filtered. + // Y blocks are always a multiple of 16 wide and exactly 16 high. U and V + // blocks are always a multiple of 8 wide and exactly 8 high. + const int block_width = 136; + const int block_height = 16; + + // 5-tap filter needs 2 padding rows above and below the block in the input. + // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16. + Buffer src_image = + Buffer(block_width, block_height, 2, 2, 10, 2); + ASSERT_TRUE(src_image.Init()); + + // Filter extends output block by 8 samples at left and right edges. 
+ // Though the left padding is only 8 bytes, there is 'above' padding as well + // so when the assembly code tries to read 16 bytes before the pointer it is + // not a problem. + // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16. + Buffer dst_image = + Buffer(block_width, block_height, 8, 8, 16, 8); + ASSERT_TRUE(dst_image.Init()); + Buffer dst_image_ref = Buffer(block_width, block_height, 8); + ASSERT_TRUE(dst_image_ref.Init()); + + // Filter values are set in blocks of 16 for Y and 8 for U/V. Each macroblock + // can have a different filter. SSE2 assembly reads flimits in blocks of 16 so + // it must be padded out. + const int flimits_width = block_width % 16 ? block_width + 8 : block_width; + uint8_t *const flimits = + reinterpret_cast(vpx_memalign(16, flimits_width)); + + ACMRandom rnd; + rnd.Reset(ACMRandom::DeterministicSeed()); + // Initialize pixels in the input: + // block pixels to random values. + // border pixels to value 10. + src_image.SetPadding(10); + src_image.Set(&rnd, &ACMRandom::Rand8); + + for (int blocks = 0; blocks < block_width; blocks += 8) { + (void)memset(flimits, 0, sizeof(*flimits) * flimits_width); + + for (int f = 0; f < 255; f++) { + (void)memset(flimits + blocks, f, sizeof(*flimits) * 8); + + dst_image.Set(0); + dst_image_ref.Set(0); + + vpx_post_proc_down_and_across_mb_row_c( + src_image.TopLeftPixel(), dst_image_ref.TopLeftPixel(), + src_image.stride(), dst_image_ref.stride(), block_width, flimits, + block_height); + ASM_REGISTER_STATE_CHECK( + GetParam()(src_image.TopLeftPixel(), dst_image.TopLeftPixel(), + src_image.stride(), dst_image.stride(), block_width, + flimits, block_height)); + + ASSERT_TRUE(dst_image.CheckValues(dst_image_ref)); + } + } + + vpx_free(flimits); +} + +class VpxMbPostProcAcrossIpTest + : public ::testing::TestWithParam { + public: + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + void SetCols(unsigned char *s, int rows, int cols, int src_width) { + for (int r = 0; r < rows; r++) { + for (int c = 0; c < cols; c++) { + s[c] = c; + } + s += src_width; + } + } + + void RunComparison(const unsigned char *expected_output, unsigned char *src_c, + int rows, int cols, int src_pitch) { + for (int r = 0; r < rows; r++) { + for (int c = 0; c < cols; c++) { + ASSERT_EQ(expected_output[c], src_c[c]) + << "at (" << r << ", " << c << ")"; + } + src_c += src_pitch; + } + } + + void RunFilterLevel(unsigned char *s, int rows, int cols, int src_width, + int filter_level, const unsigned char *expected_output) { + ASM_REGISTER_STATE_CHECK( + GetParam()(s, src_width, rows, cols, filter_level)); + RunComparison(expected_output, s, rows, cols, src_width); + } +}; + +TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) { + const int rows = 16; + const int cols = 16; + + Buffer src = Buffer(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(src.Init()); + src.SetPadding(10); + SetCols(src.TopLeftPixel(), rows, cols, src.stride()); + + Buffer expected_output = Buffer(cols, rows, 0); + ASSERT_TRUE(expected_output.Init()); + SetCols(expected_output.TopLeftPixel(), rows, cols, expected_output.stride()); + + RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(0), + expected_output.TopLeftPixel()); +} + +TEST_P(VpxMbPostProcAcrossIpTest, CheckMediumFilterOutput) { + const int rows = 16; + const int cols = 16; + + Buffer src = Buffer(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(src.Init()); + src.SetPadding(10); + SetCols(src.TopLeftPixel(), rows, cols, src.stride()); + + static const unsigned char 
kExpectedOutput[cols] = { + 2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 13 + }; + + RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(70), + kExpectedOutput); +} + +TEST_P(VpxMbPostProcAcrossIpTest, CheckHighFilterOutput) { + const int rows = 16; + const int cols = 16; + + Buffer src = Buffer(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(src.Init()); + src.SetPadding(10); + SetCols(src.TopLeftPixel(), rows, cols, src.stride()); + + static const unsigned char kExpectedOutput[cols] = { + 2, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 11, 12, 13, 13 + }; + + RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), INT_MAX, + kExpectedOutput); + + SetCols(src.TopLeftPixel(), rows, cols, src.stride()); + + RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(100), + kExpectedOutput); +} + +TEST_P(VpxMbPostProcAcrossIpTest, CheckCvsAssembly) { + const int rows = 16; + const int cols = 16; + + Buffer c_mem = Buffer(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(c_mem.Init()); + Buffer asm_mem = Buffer(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(asm_mem.Init()); + + // When level >= 100, the filter behaves the same as the level = INT_MAX + // When level < 20, it behaves the same as the level = 0 + for (int level = 0; level < 100; level++) { + c_mem.SetPadding(10); + asm_mem.SetPadding(10); + SetCols(c_mem.TopLeftPixel(), rows, cols, c_mem.stride()); + SetCols(asm_mem.TopLeftPixel(), rows, cols, asm_mem.stride()); + + vpx_mbpost_proc_across_ip_c(c_mem.TopLeftPixel(), c_mem.stride(), rows, + cols, q2mbl(level)); + ASM_REGISTER_STATE_CHECK(GetParam()( + asm_mem.TopLeftPixel(), asm_mem.stride(), rows, cols, q2mbl(level))); + + ASSERT_TRUE(asm_mem.CheckValues(c_mem)); + } +} + +class VpxMbPostProcDownTest + : public ::testing::TestWithParam { + public: + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + void SetRows(unsigned char *src_c, int rows, int cols, int src_width) { + for (int r = 0; r < rows; r++) { + memset(src_c, r, cols); + src_c += src_width; + } + } + + void RunComparison(const unsigned char *expected_output, unsigned char *src_c, + int rows, int cols, int src_pitch) { + for (int r = 0; r < rows; r++) { + for (int c = 0; c < cols; c++) { + ASSERT_EQ(expected_output[r * rows + c], src_c[c]) + << "at (" << r << ", " << c << ")"; + } + src_c += src_pitch; + } + } + + void RunFilterLevel(unsigned char *s, int rows, int cols, int src_width, + int filter_level, const unsigned char *expected_output) { + ASM_REGISTER_STATE_CHECK( + GetParam()(s, src_width, rows, cols, filter_level)); + RunComparison(expected_output, s, rows, cols, src_width); + } +}; + +TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) { + const int rows = 16; + const int cols = 16; + + Buffer src_c = Buffer(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_c.Init()); + src_c.SetPadding(10); + + SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); + + static const unsigned char kExpectedOutput[rows * cols] = { + 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, + 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 4, 4, 3, 3, 3, + 4, 4, 3, 4, 4, 3, 3, 4, 5, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, + 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 8, 9, 9, 8, 8, 8, 9, + 9, 8, 9, 9, 8, 8, 8, 9, 9, 10, 10, 9, 9, 9, 10, 10, 9, 10, 10, + 9, 9, 9, 10, 10, 10, 11, 10, 10, 10, 11, 10, 
11, 10, 11, 10, 10, 10, 11, + 10, 11, 11, 11, 11, 11, 11, 11, 12, 11, 11, 11, 11, 11, 11, 11, 12, 11, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 12, + 13, 12, 13, 12, 12, 12, 13, 12, 13, 12, 13, 12, 13, 13, 13, 14, 13, 13, 13, + 13, 13, 13, 13, 14, 13, 13, 13, 13 + }; + + RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), INT_MAX, + kExpectedOutput); + + src_c.SetPadding(10); + SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); + RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), q2mbl(100), + kExpectedOutput); +} + +TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) { + const int rows = 16; + const int cols = 16; + + Buffer src_c = Buffer(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_c.Init()); + src_c.SetPadding(10); + + SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); + + static const unsigned char kExpectedOutput[rows * cols] = { + 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, + 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 13, 12, + 13, 12, 13, 12, 12, 12, 13, 12, 13, 12, 13, 12, 13, 13, 13, 14, 13, 13, 13, + 13, 13, 13, 13, 14, 13, 13, 13, 13 + }; + + RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), q2mbl(70), + kExpectedOutput); +} + +TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) { + const int rows = 16; + const int cols = 16; + + Buffer src_c = Buffer(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_c.Init()); + src_c.SetPadding(10); + + SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); + + unsigned char *expected_output = new unsigned char[rows * cols]; + ASSERT_TRUE(expected_output != NULL); + SetRows(expected_output, rows, cols, cols); + + RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), q2mbl(0), + expected_output); + + delete[] expected_output; +} + +TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) { + const int rows = 16; + const int cols = 16; + + ACMRandom rnd; + rnd.Reset(ACMRandom::DeterministicSeed()); + + Buffer src_c = Buffer(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_c.Init()); + Buffer src_asm = Buffer(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_asm.Init()); + + for (int level = 0; level < 100; level++) { + src_c.SetPadding(10); + src_asm.SetPadding(10); + src_c.Set(&rnd, &ACMRandom::Rand8); + src_asm.CopyFrom(src_c); + + vpx_mbpost_proc_down_c(src_c.TopLeftPixel(), src_c.stride(), rows, cols, + q2mbl(level)); + ASM_REGISTER_STATE_CHECK(GetParam()( + src_asm.TopLeftPixel(), src_asm.stride(), rows, cols, q2mbl(level))); + ASSERT_TRUE(src_asm.CheckValues(src_c)); + + src_c.SetPadding(10); + src_asm.SetPadding(10); + src_c.Set(&rnd, &ACMRandom::Rand8Extremes); + src_asm.CopyFrom(src_c); + + vpx_mbpost_proc_down_c(src_c.TopLeftPixel(), src_c.stride(), rows, cols, + q2mbl(level)); + ASM_REGISTER_STATE_CHECK(GetParam()( + src_asm.TopLeftPixel(), src_asm.stride(), rows, cols, q2mbl(level))); 
+ ASSERT_TRUE(src_asm.CheckValues(src_c)); + } +} + INSTANTIATE_TEST_CASE_P( - C, VPxPostProcessingFilterTest, + C, VpxPostProcDownAndAcrossMbRowTest, ::testing::Values(vpx_post_proc_down_and_across_mb_row_c)); +INSTANTIATE_TEST_CASE_P(C, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_c)); + +INSTANTIATE_TEST_CASE_P(C, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_c)); + #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( - SSE2, VPxPostProcessingFilterTest, + SSE2, VpxPostProcDownAndAcrossMbRowTest, ::testing::Values(vpx_post_proc_down_and_across_mb_row_sse2)); -#endif + +INSTANTIATE_TEST_CASE_P(SSE2, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_sse2)); + +INSTANTIATE_TEST_CASE_P(SSE2, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_sse2)); +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, VpxPostProcDownAndAcrossMbRowTest, + ::testing::Values(vpx_post_proc_down_and_across_mb_row_neon)); + +INSTANTIATE_TEST_CASE_P(NEON, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_neon)); + +INSTANTIATE_TEST_CASE_P(NEON, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_neon)); +#endif // HAVE_NEON #if HAVE_MSA INSTANTIATE_TEST_CASE_P( - MSA, VPxPostProcessingFilterTest, + MSA, VpxPostProcDownAndAcrossMbRowTest, ::testing::Values(vpx_post_proc_down_and_across_mb_row_msa)); -#endif + +INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_msa)); + +INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_msa)); +#endif // HAVE_MSA } // namespace diff --git a/libs/libvpx/test/predict_test.cc b/libs/libvpx/test/predict_test.cc index 07841a1176..9f366ae529 100644 --- a/libs/libvpx/test/predict_test.cc +++ b/libs/libvpx/test/predict_test.cc @@ -292,15 +292,13 @@ INSTANTIATE_TEST_CASE_P( NEON, SixtapPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_neon), make_tuple(8, 8, &vp8_sixtap_predict8x8_neon), - make_tuple(8, 4, &vp8_sixtap_predict8x4_neon))); + make_tuple(8, 4, &vp8_sixtap_predict8x4_neon), + make_tuple(4, 4, &vp8_sixtap_predict4x4_neon))); #endif #if HAVE_MMX INSTANTIATE_TEST_CASE_P( MMX, SixtapPredictTest, - ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmx), - make_tuple(8, 8, &vp8_sixtap_predict8x8_mmx), - make_tuple(8, 4, &vp8_sixtap_predict8x4_mmx), - make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx))); + ::testing::Values(make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx))); #endif #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( @@ -326,6 +324,15 @@ INSTANTIATE_TEST_CASE_P( make_tuple(4, 4, &vp8_sixtap_predict4x4_msa))); #endif +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P( + MMI, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmi), + make_tuple(8, 8, &vp8_sixtap_predict8x8_mmi), + make_tuple(8, 4, &vp8_sixtap_predict8x4_mmi), + make_tuple(4, 4, &vp8_sixtap_predict4x4_mmi))); +#endif + class BilinearPredictTest : public PredictTestBase {}; TEST_P(BilinearPredictTest, TestWithRandomData) { @@ -352,9 +359,7 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_MMX INSTANTIATE_TEST_CASE_P( MMX, BilinearPredictTest, - ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_mmx), - make_tuple(8, 8, &vp8_bilinear_predict8x8_mmx), - make_tuple(8, 4, &vp8_bilinear_predict8x4_mmx), + ::testing::Values(make_tuple(8, 4, &vp8_bilinear_predict8x4_mmx), make_tuple(4, 4, &vp8_bilinear_predict4x4_mmx))); #endif #if HAVE_SSE2 diff --git 
a/libs/libvpx/test/quantize_test.cc b/libs/libvpx/test/quantize_test.cc index 69da8994ca..40bb2642e4 100644 --- a/libs/libvpx/test/quantize_test.cc +++ b/libs/libvpx/test/quantize_test.cc @@ -200,4 +200,12 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp8_fast_quantize_b_msa, &vp8_fast_quantize_b_c), make_tuple(&vp8_regular_quantize_b_msa, &vp8_regular_quantize_b_c))); #endif // HAVE_MSA + +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P( + MMI, QuantizeTest, + ::testing::Values( + make_tuple(&vp8_fast_quantize_b_mmi, &vp8_fast_quantize_b_c), + make_tuple(&vp8_regular_quantize_b_mmi, &vp8_regular_quantize_b_c))); +#endif // HAVE_MMI } // namespace diff --git a/libs/libvpx/test/register_state_check.h b/libs/libvpx/test/register_state_check.h index b8ba34328c..a779e5c06a 100644 --- a/libs/libvpx/test/register_state_check.h +++ b/libs/libvpx/test/register_state_check.h @@ -32,7 +32,9 @@ #undef NOMINMAX #define NOMINMAX +#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif #include #include @@ -111,8 +113,8 @@ class RegisterStateCheck { int64_t post_store[8]; vpx_push_neon(post_store); for (int i = 0; i < 8; ++i) { - EXPECT_EQ(pre_store_[i], post_store[i]) << "d" << i + 8 - << " has been modified"; + EXPECT_EQ(pre_store_[i], post_store[i]) + << "d" << i + 8 << " has been modified"; } } diff --git a/libs/libvpx/test/resize_test.cc b/libs/libvpx/test/resize_test.cc index c9950dd433..e95dc6651a 100644 --- a/libs/libvpx/test/resize_test.cc +++ b/libs/libvpx/test/resize_test.cc @@ -298,10 +298,10 @@ TEST_P(ResizeTest, TestExternalResizeWorks) { unsigned int expected_h; ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, &expected_h, 0); - EXPECT_EQ(expected_w, info->w) << "Frame " << frame - << " had unexpected width"; - EXPECT_EQ(expected_h, info->h) << "Frame " << frame - << " had unexpected height"; + EXPECT_EQ(expected_w, info->w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info->h) + << "Frame " << frame << " had unexpected height"; } } @@ -513,10 +513,10 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { unsigned int expected_h; ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, &expected_h, 1); - EXPECT_EQ(expected_w, info->w) << "Frame " << frame - << " had unexpected width"; - EXPECT_EQ(expected_h, info->h) << "Frame " << frame - << " had unexpected height"; + EXPECT_EQ(expected_w, info->w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info->h) + << "Frame " << frame << " had unexpected height"; EXPECT_EQ(static_cast(0), GetMismatchFrames()); } } diff --git a/libs/libvpx/test/sad_test.cc b/libs/libvpx/test/sad_test.cc index 837b08fbdf..67c3c53150 100644 --- a/libs/libvpx/test/sad_test.cc +++ b/libs/libvpx/test/sad_test.cc @@ -644,19 +644,50 @@ INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); #if HAVE_NEON const SadMxNParam neon_tests[] = { SadMxNParam(64, 64, &vpx_sad64x64_neon), + SadMxNParam(64, 32, &vpx_sad64x32_neon), SadMxNParam(32, 32, &vpx_sad32x32_neon), + SadMxNParam(16, 32, &vpx_sad16x32_neon), SadMxNParam(16, 16, &vpx_sad16x16_neon), SadMxNParam(16, 8, &vpx_sad16x8_neon), SadMxNParam(8, 16, &vpx_sad8x16_neon), SadMxNParam(8, 8, &vpx_sad8x8_neon), + SadMxNParam(8, 4, &vpx_sad8x4_neon), + SadMxNParam(4, 8, &vpx_sad4x8_neon), SadMxNParam(4, 4, &vpx_sad4x4_neon), }; INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); +const SadMxNAvgParam avg_neon_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon), + SadMxNAvgParam(64, 32, 
&vpx_sad64x32_avg_neon), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_neon), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_neon), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_neon), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_neon), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_neon), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_neon), + SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_neon), + SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_neon), + SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_neon), + SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_neon), + SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_neon), +}; +INSTANTIATE_TEST_CASE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests)); + const SadMxNx4Param x4d_neon_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_neon), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_neon), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon), SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon), + SadMxNx4Param(8, 16, &vpx_sad8x16x4d_neon), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_neon), + SadMxNx4Param(8, 4, &vpx_sad8x4x4d_neon), + SadMxNx4Param(4, 8, &vpx_sad4x8x4d_neon), + SadMxNx4Param(4, 4, &vpx_sad4x4x4d_neon), }; INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); #endif // HAVE_NEON @@ -865,6 +896,14 @@ const SadMxNx4Param x4d_avx2_tests[] = { INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); #endif // HAVE_AVX2 +#if HAVE_AVX512 +const SadMxNx4Param x4d_avx512_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx512), +}; +INSTANTIATE_TEST_CASE_P(AVX512, SADx4Test, + ::testing::ValuesIn(x4d_avx512_tests)); +#endif // HAVE_AVX512 + //------------------------------------------------------------------------------ // MIPS functions #if HAVE_MSA @@ -920,4 +959,98 @@ const SadMxNx4Param x4d_msa_tests[] = { INSTANTIATE_TEST_CASE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests)); #endif // HAVE_MSA +//------------------------------------------------------------------------------ +// VSX functions +#if HAVE_VSX +const SadMxNParam vsx_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_vsx), + SadMxNParam(64, 32, &vpx_sad64x32_vsx), + SadMxNParam(32, 64, &vpx_sad32x64_vsx), + SadMxNParam(32, 32, &vpx_sad32x32_vsx), + SadMxNParam(32, 16, &vpx_sad32x16_vsx), + SadMxNParam(16, 32, &vpx_sad16x32_vsx), + SadMxNParam(16, 16, &vpx_sad16x16_vsx), + SadMxNParam(16, 8, &vpx_sad16x8_vsx), +}; +INSTANTIATE_TEST_CASE_P(VSX, SADTest, ::testing::ValuesIn(vsx_tests)); + +const SadMxNAvgParam avg_vsx_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_vsx), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_vsx), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_vsx), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_vsx), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_vsx), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_vsx), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_vsx), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_vsx), +}; +INSTANTIATE_TEST_CASE_P(VSX, SADavgTest, ::testing::ValuesIn(avg_vsx_tests)); + +const SadMxNx4Param x4d_vsx_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_vsx), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_vsx), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_vsx), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_vsx), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_vsx), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_vsx), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_vsx), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_vsx), +}; 
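
The SadMxNParam/SadMxNAvgParam/SadMxNx4Param entries above and below bind the optimized kernels to one parameterized test. For orientation only, here is a minimal sketch in plain C (hypothetical name, not libvpx's implementation) of the reduction under test: a sum of absolute differences over an MxN block. The *_avg_* variants first average the reference block with a second predictor, and the *x4d* variants compute four such SADs against four reference blocks in one call.

#include <stdint.h>
#include <stdlib.h>

/* Illustrative reference only: the vpx_sadMxN_{neon,vsx,mmi,...} kernels
 * instantiated in this file compute the same reduction with SIMD. */
static unsigned int block_sad(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) sad += abs(src[c] - ref[c]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}
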
+INSTANTIATE_TEST_CASE_P(VSX, SADx4Test, ::testing::ValuesIn(x4d_vsx_tests)); +#endif // HAVE_VSX + +//------------------------------------------------------------------------------ +// Loongson functions +#if HAVE_MMI +const SadMxNParam mmi_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_mmi), + SadMxNParam(64, 32, &vpx_sad64x32_mmi), + SadMxNParam(32, 64, &vpx_sad32x64_mmi), + SadMxNParam(32, 32, &vpx_sad32x32_mmi), + SadMxNParam(32, 16, &vpx_sad32x16_mmi), + SadMxNParam(16, 32, &vpx_sad16x32_mmi), + SadMxNParam(16, 16, &vpx_sad16x16_mmi), + SadMxNParam(16, 8, &vpx_sad16x8_mmi), + SadMxNParam(8, 16, &vpx_sad8x16_mmi), + SadMxNParam(8, 8, &vpx_sad8x8_mmi), + SadMxNParam(8, 4, &vpx_sad8x4_mmi), + SadMxNParam(4, 8, &vpx_sad4x8_mmi), + SadMxNParam(4, 4, &vpx_sad4x4_mmi), +}; +INSTANTIATE_TEST_CASE_P(MMI, SADTest, ::testing::ValuesIn(mmi_tests)); + +const SadMxNAvgParam avg_mmi_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_mmi), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_mmi), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_mmi), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_mmi), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_mmi), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_mmi), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_mmi), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_mmi), + SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_mmi), + SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_mmi), + SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_mmi), + SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_mmi), + SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_mmi), +}; +INSTANTIATE_TEST_CASE_P(MMI, SADavgTest, ::testing::ValuesIn(avg_mmi_tests)); + +const SadMxNx4Param x4d_mmi_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_mmi), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_mmi), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_mmi), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_mmi), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_mmi), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_mmi), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_mmi), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_mmi), + SadMxNx4Param(8, 16, &vpx_sad8x16x4d_mmi), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_mmi), + SadMxNx4Param(8, 4, &vpx_sad8x4x4d_mmi), + SadMxNx4Param(4, 8, &vpx_sad4x8x4d_mmi), + SadMxNx4Param(4, 4, &vpx_sad4x4x4d_mmi), +}; +INSTANTIATE_TEST_CASE_P(MMI, SADx4Test, ::testing::ValuesIn(x4d_mmi_tests)); +#endif // HAVE_MMI } // namespace diff --git a/libs/libvpx/test/set_roi.cc b/libs/libvpx/test/set_roi.cc index 38711a806d..f639547523 100644 --- a/libs/libvpx/test/set_roi.cc +++ b/libs/libvpx/test/set_roi.cc @@ -146,14 +146,6 @@ TEST(VP8RoiMapTest, ParameterCheck) { if (deltas_valid != roi_retval) break; } - // Test that we report and error if cyclic refresh is enabled. - cpi.cyclic_refresh_mode_enabled = 1; - roi_retval = - vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows, cpi.common.mb_cols, - delta_q, delta_lf, threshold); - EXPECT_EQ(-1, roi_retval) << "cyclic refresh check error"; - cpi.cyclic_refresh_mode_enabled = 0; - // Test invalid number of rows or colums. roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows + 1, diff --git a/libs/libvpx/test/stress.sh b/libs/libvpx/test/stress.sh new file mode 100755 index 0000000000..a899c800ca --- /dev/null +++ b/libs/libvpx/test/stress.sh @@ -0,0 +1,169 @@ +#!/bin/sh +## +## Copyright (c) 2016 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. 
An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This file performs a stress test. It runs (STRESS_ONEPASS_MAX_JOBS, +## default=5) one, (STRESS_TWOPASS_MAX_JOBS, default=5) two pass & +## (STRESS_RT_MAX_JOBS, default=5) encodes and (STRESS__DECODE_MAX_JOBS, +## default=30) decodes in parallel. + +. $(dirname $0)/tools_common.sh + +YUV="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.yuv" +VP8="${LIBVPX_TEST_DATA_PATH}/tos_vp8.webm" +VP9="${LIBVPX_TEST_DATA_PATH}/vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm" +DATA_URL="http://downloads.webmproject.org/test_data/libvpx/" +SHA1_FILE="$(dirname $0)/test-data.sha1" + +# Set sha1sum to proper sha program (sha1sum, shasum, sha1). This code is +# cribbed from libs.mk. +[ -x "$(which sha1sum)" ] && sha1sum=sha1sum +[ -x "$(which shasum)" ] && sha1sum=shasum +[ -x "$(which sha1)" ] && sha1sum=sha1 + +# Download a file from the url and check its sha1sum. +download_and_check_file() { + # Get the file from the file path. + local readonly root="${1#${LIBVPX_TEST_DATA_PATH}/}" + + # Download the file using curl. Trap to insure non partial file. + (trap "rm -f $1" INT TERM \ + && eval "curl --retry 1 -L -o $1 ${DATA_URL}${root} ${devnull}") + + # Check the sha1 sum of the file. + if [ -n "${sha1sum}" ]; then + set -e + grep ${root} ${SHA1_FILE} \ + | (cd ${LIBVPX_TEST_DATA_PATH}; ${sha1sum} -c); + fi +} + +# Environment check: Make sure input is available. +stress_verify_environment() { + if [ ! -e "${SHA1_FILE}" ] ; then + echo "Missing ${SHA1_FILE}" + return 1 + fi + for file in "${YUV}" "${VP8}" "${VP9}"; do + if [ ! -e "${file}" ] ; then + download_and_check_file "${file}" + fi + done + if [ ! -e "${YUV}" ] || [ ! -e "${VP8}" ] || [ ! -e "${VP9}" ] ; then + elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ -z "$(vpx_tool_path vpxenc)" ]; then + elog "vpxenc not found. It must exist in LIBVPX_BIN_PATH or its parent." + return 1 + fi + if [ -z "$(vpx_tool_path vpxdec)" ]; then + elog "vpxdec not found. It must exist in LIBVPX_BIN_PATH or its parent." + return 1 + fi +} + +# This function runs tests on libvpx that run multiple encodes and decodes +# in parallel in hopes of catching synchronization and/or threading issues. +stress() { + local readonly decoder="$(vpx_tool_path vpxdec)" + local readonly encoder="$(vpx_tool_path vpxenc)" + local readonly codec="$1" + local readonly webm="$2" + local readonly decode_count="$3" + local readonly threads="$4" + local readonly enc_args="$5" + local pids="" + local rt_max_jobs=${STRESS_RT_MAX_JOBS:-5} + local onepass_max_jobs=${STRESS_ONEPASS_MAX_JOBS:-5} + local twopass_max_jobs=${STRESS_TWOPASS_MAX_JOBS:-5} + + # Enable job control, so we can run multiple processes. + set -m + + # Start $onepass_max_jobs encode jobs in parallel. + for i in $(seq ${onepass_max_jobs}); do + bitrate=$(($i * 20 + 300)) + eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \ + "${YUV}" "-t ${threads} --limit=150 --test-decode=fatal --passes=1" \ + "--target-bitrate=${bitrate} -o ${VPX_TEST_OUTPUT_DIR}/${i}.1pass.webm" \ + "${enc_args}" ${devnull} & + pids="${pids} $!" + done + + # Start $twopass_max_jobs encode jobs in parallel. 
+ for i in $(seq ${twopass_max_jobs}); do + bitrate=$(($i * 20 + 300)) + eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \ + "${YUV}" "-t ${threads} --limit=150 --test-decode=fatal --passes=2" \ + "--target-bitrate=${bitrate} -o ${VPX_TEST_OUTPUT_DIR}/${i}.2pass.webm" \ + "${enc_args}" ${devnull} & + pids="${pids} $!" + done + + # Start $rt_max_jobs rt encode jobs in parallel. + for i in $(seq ${rt_max_jobs}); do + bitrate=$(($i * 20 + 300)) + eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \ + "${YUV}" "-t ${threads} --limit=150 --test-decode=fatal " \ + "--target-bitrate=${bitrate} --lag-in-frames=0 --error-resilient=1" \ + "--kf-min-dist=3000 --kf-max-dist=3000 --cpu-used=-6 --static-thresh=1" \ + "--end-usage=cbr --min-q=2 --max-q=56 --undershoot-pct=100" \ + "--overshoot-pct=15 --buf-sz=1000 --buf-initial-sz=500" \ + "--buf-optimal-sz=600 --max-intra-rate=900 --resize-allowed=0" \ + "--drop-frame=0 --passes=1 --rt --noise-sensitivity=4" \ + "-o ${VPX_TEST_OUTPUT_DIR}/${i}.rt.webm" ${devnull} & + pids="${pids} $!" + done + + # Start $decode_count decode jobs in parallel. + for i in $(seq "${decode_count}"); do + eval "${decoder}" "-t ${threads}" "${webm}" "--noblit" ${devnull} & + pids="${pids} $!" + done + + # Wait for all parallel jobs to finish. + fail=0 + for job in "${pids}"; do + wait $job || fail=$(($fail + 1)) + done + return $fail +} + +vp8_stress_test() { + local vp8_max_jobs=${STRESS_VP8_DECODE_MAX_JOBS:-40} + if [ "$(vp8_decode_available)" = "yes" -a \ + "$(vp8_encode_available)" = "yes" ]; then + stress vp8 "${VP8}" "${vp8_max_jobs}" 4 + fi +} + +vp9_stress() { + local vp9_max_jobs=${STRESS_VP9_DECODE_MAX_JOBS:-25} + + if [ "$(vp9_decode_available)" = "yes" -a \ + "$(vp9_encode_available)" = "yes" ]; then + stress vp9 "${VP9}" "${vp9_max_jobs}" "$@" + fi +} + +vp9_stress_test() { + for threads in 4 8 100; do + vp9_stress "$threads" "--row-mt=0" + done +} + +vp9_stress_test_row_mt() { + for threads in 4 8 100; do + vp9_stress "$threads" "--row-mt=1" + done +} + +run_tests stress_verify_environment \ + "vp8_stress_test vp9_stress_test vp9_stress_test_row_mt" diff --git a/libs/libvpx/test/sum_squares_test.cc b/libs/libvpx/test/sum_squares_test.cc index 3aa43d2ce0..9c407c649f 100644 --- a/libs/libvpx/test/sum_squares_test.cc +++ b/libs/libvpx/test/sum_squares_test.cc @@ -110,4 +110,10 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, &vpx_sum_squares_2d_i16_sse2))); #endif // HAVE_SSE2 + +#if HAVE_MSA +INSTANTIATE_TEST_CASE_P(MSA, SumSquaresTest, ::testing::Values(make_tuple( + &vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_msa))); +#endif // HAVE_MSA } // namespace diff --git a/libs/libvpx/test/svc_test.cc b/libs/libvpx/test/svc_test.cc index 949a820c4f..482d9fffa1 100644 --- a/libs/libvpx/test/svc_test.cc +++ b/libs/libvpx/test/svc_test.cc @@ -438,7 +438,7 @@ TEST_F(SvcTest, SetAutoAltRefOption) { // Test that decoder can handle an SVC frame as the first frame in a sequence. 
 TEST_F(SvcTest, OnePassEncodeOneFrame) {
   codec_enc_.g_pass = VPX_RC_ONE_PASS;
-  vpx_fixed_buf output = { 0 };
+  vpx_fixed_buf output = vpx_fixed_buf();
   Pass2EncodeNFrames(NULL, 1, 2, &output);
   DecodeNFrames(&output, 1);
   FreeBitstreamBuffers(&output, 1);
diff --git a/libs/libvpx/test/temporal_filter_test.cc b/libs/libvpx/test/temporal_filter_test.cc
new file mode 100644
index 0000000000..655a36be9a
--- /dev/null
+++ b/libs/libvpx/test/temporal_filter_test.cc
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "test/acm_random.h"
+#include "test/buffer.h"
+#include "test/register_state_check.h"
+#include "vpx_ports/vpx_timer.h"
+
+namespace {
+
+using ::libvpx_test::ACMRandom;
+using ::libvpx_test::Buffer;
+
+typedef void (*TemporalFilterFunc)(const uint8_t *a, unsigned int stride,
+                                   const uint8_t *b, unsigned int w,
+                                   unsigned int h, int filter_strength,
+                                   int filter_weight, unsigned int *accumulator,
+                                   uint16_t *count);
+
+// Calculate the difference between 'a' and 'b', sum in blocks of 9, and apply
+// filter based on strength and weight. Store the resulting filter amount in
+// 'count' and apply it to 'b' and store it in 'accumulator'.
+void reference_filter(const Buffer<uint8_t> &a, const Buffer<uint8_t> &b, int w,
+                      int h, int filter_strength, int filter_weight,
+                      Buffer<unsigned int> *accumulator,
+                      Buffer<uint16_t> *count) {
+  Buffer<int> diff_sq = Buffer<int>(w, h, 0);
+  ASSERT_TRUE(diff_sq.Init());
+  diff_sq.Set(0);
+
+  int rounding = 0;
+  if (filter_strength > 0) {
+    rounding = 1 << (filter_strength - 1);
+  }
+
+  // Calculate all the differences. Avoids re-calculating a bunch of extra
+  // values.
+  for (int height = 0; height < h; ++height) {
+    for (int width = 0; width < w; ++width) {
+      int diff = a.TopLeftPixel()[height * a.stride() + width] -
+                 b.TopLeftPixel()[height * b.stride() + width];
+      diff_sq.TopLeftPixel()[height * diff_sq.stride() + width] = diff * diff;
+    }
+  }
+
+  // For any given point, sum the neighboring values and calculate the
+  // modifier.
+  for (int height = 0; height < h; ++height) {
+    for (int width = 0; width < w; ++width) {
+      // Determine how many values are being summed.
+      int summed_values = 9;
+
+      if (height == 0 || height == (h - 1)) {
+        summed_values -= 3;
+      }
+
+      if (width == 0 || width == (w - 1)) {
+        if (summed_values == 6) {  // corner
+          summed_values -= 2;
+        } else {
+          summed_values -= 3;
+        }
+      }
+
+      // Sum the diff_sq of the surrounding values.
+      int sum = 0;
+      for (int idy = -1; idy <= 1; ++idy) {
+        for (int idx = -1; idx <= 1; ++idx) {
+          const int y = height + idy;
+          const int x = width + idx;
+
+          // If inside the border.
+          if (y >= 0 && y < h && x >= 0 && x < w) {
+            sum += diff_sq.TopLeftPixel()[y * diff_sq.stride() + x];
+          }
+        }
+      }
+
+      sum *= 3;
+      sum /= summed_values;
+      sum += rounding;
+      sum >>= filter_strength;
+
+      // Clamp the value and invert it.
+      if (sum > 16) sum = 16;
+      sum = 16 - sum;
+
+      sum *= filter_weight;
+
+      count->TopLeftPixel()[height * count->stride() + width] += sum;
+      accumulator->TopLeftPixel()[height * accumulator->stride() + width] +=
+          sum * b.TopLeftPixel()[height * b.stride() + width];
+    }
+  }
+}
+
+class TemporalFilterTest : public ::testing::TestWithParam<TemporalFilterFunc> {
+ public:
+  virtual void SetUp() {
+    filter_func_ = GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  TemporalFilterFunc filter_func_;
+  ACMRandom rnd_;
+};
+
+TEST_P(TemporalFilterTest, SizeCombinations) {
+  // Depending on subsampling this function may be called with values of 8 or 16
+  // for width and height, in any combination.
+  Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
+  ASSERT_TRUE(a.Init());
+
+  const int filter_weight = 2;
+  const int filter_strength = 6;
+
+  for (int width = 8; width <= 16; width += 8) {
+    for (int height = 8; height <= 16; height += 8) {
+      // The second buffer must not have any border.
+      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+      ASSERT_TRUE(b.Init());
+      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_ref.Init());
+      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_chk.Init());
+      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_ref.Init());
+      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_chk.Init());
+
+      // The difference between the buffers must be small to pass the threshold
+      // to apply the filter.
+      a.Set(&rnd_, 0, 7);
+      b.Set(&rnd_, 0, 7);
+
+      accum_ref.Set(rnd_.Rand8());
+      accum_chk.CopyFrom(accum_ref);
+      count_ref.Set(rnd_.Rand8());
+      count_chk.CopyFrom(count_ref);
+      reference_filter(a, b, width, height, filter_strength, filter_weight,
+                       &accum_ref, &count_ref);
+      ASM_REGISTER_STATE_CHECK(
+          filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
+                       height, filter_strength, filter_weight,
+                       accum_chk.TopLeftPixel(), count_chk.TopLeftPixel()));
+      EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
+      EXPECT_TRUE(count_chk.CheckValues(count_ref));
+      if (HasFailure()) {
+        printf("Width: %d Height: %d\n", width, height);
+        count_chk.PrintDifference(count_ref);
+        accum_chk.PrintDifference(accum_ref);
+        return;
+      }
+    }
+  }
+}
+
+TEST_P(TemporalFilterTest, CompareReferenceRandom) {
+  for (int width = 8; width <= 16; width += 8) {
+    for (int height = 8; height <= 16; height += 8) {
+      Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);
+      ASSERT_TRUE(a.Init());
+      // The second buffer must not have any border.
+      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+      ASSERT_TRUE(b.Init());
+      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_ref.Init());
+      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_chk.Init());
+      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_ref.Init());
+      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_chk.Init());
+
+      for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {
+        for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {
+          for (int repeat = 0; repeat < 100; ++repeat) {
+            if (repeat < 50) {
+              a.Set(&rnd_, 0, 7);
+              b.Set(&rnd_, 0, 7);
+            } else {
+              // Check large (but close) values as well.
+              a.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7,
+                    std::numeric_limits<uint8_t>::max());
+              b.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7,
+                    std::numeric_limits<uint8_t>::max());
+            }
+
+            accum_ref.Set(rnd_.Rand8());
+            accum_chk.CopyFrom(accum_ref);
+            count_ref.Set(rnd_.Rand8());
+            count_chk.CopyFrom(count_ref);
+            reference_filter(a, b, width, height, filter_strength,
+                             filter_weight, &accum_ref, &count_ref);
+            ASM_REGISTER_STATE_CHECK(filter_func_(
+                a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width, height,
+                filter_strength, filter_weight, accum_chk.TopLeftPixel(),
+                count_chk.TopLeftPixel()));
+            EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
+            EXPECT_TRUE(count_chk.CheckValues(count_ref));
+            if (HasFailure()) {
+              printf("Weight: %d Strength: %d\n", filter_weight,
+                     filter_strength);
+              count_chk.PrintDifference(count_ref);
+              accum_chk.PrintDifference(accum_ref);
+              return;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST_P(TemporalFilterTest, DISABLED_Speed) {
+  Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
+  ASSERT_TRUE(a.Init());
+
+  const int filter_weight = 2;
+  const int filter_strength = 6;
+
+  for (int width = 8; width <= 16; width += 8) {
+    for (int height = 8; height <= 16; height += 8) {
+      // The second buffer must not have any border.
+      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+      ASSERT_TRUE(b.Init());
+      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_ref.Init());
+      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+      ASSERT_TRUE(accum_chk.Init());
+      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_ref.Init());
+      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+      ASSERT_TRUE(count_chk.Init());
+
+      a.Set(&rnd_, 0, 7);
+      b.Set(&rnd_, 0, 7);
+
+      accum_chk.Set(0);
+      count_chk.Set(0);
+
+      vpx_usec_timer timer;
+      vpx_usec_timer_start(&timer);
+      for (int i = 0; i < 10000; ++i) {
+        filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
+                     height, filter_strength, filter_weight,
+                     accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());
+      }
+      vpx_usec_timer_mark(&timer);
+      const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+      printf("Temporal filter %dx%d time: %5d us\n", width, height,
+             elapsed_time);
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, TemporalFilterTest,
+                        ::testing::Values(&vp9_temporal_filter_apply_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, TemporalFilterTest,
+                        ::testing::Values(&vp9_temporal_filter_apply_sse4_1));
+#endif  // HAVE_SSE4_1
+}  // namespace
diff --git a/libs/libvpx/test/test-data.mk b/libs/libvpx/test/test-data.mk
index da2fd77d42..f405e4ef14 100644
--- a/libs/libvpx/test/test-data.mk
+++ b/libs/libvpx/test/test-data.mk
@@ -20,8 +20,10 @@ LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_440.yuv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_credits.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.y4m
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += noisy_clip_640_360.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv

 # Test vectors
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
@@ -730,6 +732,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv444.webm.md5
 endif # CONFIG_VP9_HIGHBITDEPTH

 # Invalid files for testing libvpx error checking.
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm @@ -764,15 +768,23 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s195 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-1.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-2.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-3.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm.res ifeq ($(CONFIG_DECODE_PERF_TESTS),yes) # Encode / Decode test @@ -807,7 +819,6 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += kirland_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcomoving_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcostationary_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv -LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomanarrows_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomasmallcameramovement_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += thaloundeskmtg_640_480_30.yuv @@ -865,3 +876,7 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_1-2 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_1-2.webm.md5 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf.md5 
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_1.webm.md5 diff --git a/libs/libvpx/test/test-data.sha1 b/libs/libvpx/test/test-data.sha1 index 7dd4fcf156..99b4e1e465 100644 --- a/libs/libvpx/test/test-data.sha1 +++ b/libs/libvpx/test/test-data.sha1 @@ -6,6 +6,8 @@ b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv 456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res c123d1f9f02fb4143abb5e271916e3a3080de8f6 *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf 456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res +efafb92b7567bc04c3f1432ea6c268c1c31affd5 *invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf +5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf.res fe346136b9b8c1e6f6084cc106485706915795e4 *invalid-vp90-01-v3.webm 5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-01-v3.webm.res d78e2fceba5ac942246503ec8366f879c4775ca5 *invalid-vp90-02-v2.webm @@ -14,6 +16,7 @@ df1a1453feb3c00d7d89746c7003b4163523bff3 *invalid-vp90-03-v3.webm 4935c62becc68c13642a03db1e6d3e2331c1c612 *invalid-vp90-03-v3.webm.res d637297561dd904eb2c97a9015deeb31c4a1e8d2 *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm 3a204bdbeaa3c6458b77bcebb8366d107267f55d *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res +9aa21d8b2cb9d39abe8a7bb6032dc66955fb4342 *noisy_clip_640_360.y4m a432f96ff0a787268e2f94a8092ab161a18d1b06 *park_joy_90p_10_420.y4m 0b194cc312c3a2e84d156a221b0a5eb615dfddc5 *park_joy_90p_10_422.y4m ff0e0a21dc2adc95b8c1b37902713700655ced17 *park_joy_90p_10_444.y4m @@ -838,3 +841,16 @@ a000d568431d07379dd5a8ec066061c07e560b47 *invalid-vp90-2-00-quantizer-63.ivf.kf_ 1e75aad3433c5c21c194a7b53fc393970f0a8d7f *invalid-vp90-2-00-quantizer-63.ivf.kf_65527x61446.ivf.res 235182f9a1c5c8841552510dd4288487447bfc40 *invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf 787f04f0483320d536894282f3358a4f8cac1cf9 *invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res +91d3cefd0deb98f3b0caf3a2d900ec7a7605e53a *invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf +1e472baaf5f6113459f0399a38a5a5e68d17799d *invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf.res +70057835bf29d14e66699ce5f022df2551fb6b37 *invalid-crbug-629481.webm +5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-crbug-629481.webm.res +7602e00378161ca36ae93cc6ee12dd30b5ba1e1d *vp90-2-22-svc_1280x720_3.ivf +02e53e3eefbf25ec0929047fe50876acdeb040bd *vp90-2-22-svc_1280x720_3.ivf.md5 +6fa3d3ac306a3d9ce1d610b78441dc00d2c2d4b9 *tos_vp8.webm +e402cbbf9e550ae017a1e9f1f73931c1d18474e8 *invalid-crbug-667044.webm +d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-crbug-667044.webm.res +fd9df7f3f6992af1d7a9dde975c9a0d6f28c053d *invalid-bug-1443.ivf +fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res +17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm +e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5 diff --git a/libs/libvpx/test/test.mk b/libs/libvpx/test/test.mk index 60218a780d..a3716be60c 100644 --- a/libs/libvpx/test/test.mk +++ b/libs/libvpx/test/test.mk @@ -1,4 +1,5 @@ LIBVPX_TEST_SRCS-yes += acm_random.h +LIBVPX_TEST_SRCS-yes += buffer.h LIBVPX_TEST_SRCS-yes += clear_system_state.h LIBVPX_TEST_SRCS-yes += codec_factory.h LIBVPX_TEST_SRCS-yes += 
md5_helper.h @@ -35,9 +36,9 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += byte_alignment_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += decode_svc_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_frame_parallel_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_refresh_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc @@ -46,6 +47,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_end_to_end_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ethread_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_motion_vector_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += level_test.cc LIBVPX_TEST_SRCS-yes += decode_test_driver.cc @@ -121,6 +123,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc LIBVPX_TEST_SRCS-yes += idct_test.cc LIBVPX_TEST_SRCS-yes += predict_test.cc LIBVPX_TEST_SRCS-yes += vpx_scale_test.cc +LIBVPX_TEST_SRCS-yes += vpx_scale_test.h ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_TEMPORAL_DENOISING),yesyes) LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp8_denoiser_sse2_test.cc @@ -148,14 +151,20 @@ LIBVPX_TEST_SRCS-yes += vp9_intrapred_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_decrypt_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += avg_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += comp_avg_pred_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct_partial_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_scale_test.cc +ifneq ($(CONFIG_REALTIME_ONLY),yes) +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += temporal_filter_test.cc +endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc @@ -166,7 +175,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += consistency_test.cc endif ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes) -LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp9_denoiser_sse2_test.cc +LIBVPX_TEST_SRCS-yes += vp9_denoiser_test.cc endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_arf_freq_test.cc diff --git a/libs/libvpx/test/test_intra_pred_speed.cc b/libs/libvpx/test/test_intra_pred_speed.cc index a896307534..1cdeda410a 100644 --- a/libs/libvpx/test/test_intra_pred_speed.cc +++ b/libs/libvpx/test/test_intra_pred_speed.cc @@ -29,6 +29,8 @@ namespace { typedef void (*VpxPredFunc)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +const int kBPS = 32; +const int kTotalPixels = 32 * kBPS; const int kNumVp9IntraPredFuncs = 13; const char 
*kVp9IntraPredNames[kNumVp9IntraPredFuncs] = { "DC_PRED", "DC_LEFT_PRED", "DC_TOP_PRED", "DC_128_PRED", "V_PRED", @@ -36,107 +38,121 @@ const char *kVp9IntraPredNames[kNumVp9IntraPredFuncs] = { "D207_PRED", "D63_PRED", "TM_PRED" }; +template +struct IntraPredTestMem { + void Init(int block_size, int bd) { + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + Pixel *const above = above_mem + 16; + const int mask = (1 << bd) - 1; + for (int i = 0; i < kTotalPixels; ++i) ref_src[i] = rnd.Rand16() & mask; + for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand16() & mask; + for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand16() & mask; + + // some code assumes the top row has been extended: + // d45/d63 C-code, for instance, but not the assembly. + // TODO(jzern): this style of extension isn't strictly necessary. + ASSERT_LE(block_size, kBPS); + for (int i = block_size; i < 2 * kBPS; ++i) { + above[i] = above[block_size - 1]; + } + } + + DECLARE_ALIGNED(16, Pixel, src[kTotalPixels]); + DECLARE_ALIGNED(16, Pixel, ref_src[kTotalPixels]); + DECLARE_ALIGNED(16, Pixel, left[kBPS]); + DECLARE_ALIGNED(16, Pixel, above_mem[2 * kBPS + 16]); +}; + +typedef IntraPredTestMem Vp9IntraPredTestMem; + +void CheckMd5Signature(const char name[], const char *const signatures[], + const void *data, size_t data_size, int elapsed_time, + int idx) { + libvpx_test::MD5 md5; + md5.Add(reinterpret_cast(data), data_size); + printf("Mode %s[%12s]: %5d ms MD5: %s\n", name, kVp9IntraPredNames[idx], + elapsed_time, md5.Get()); + EXPECT_STREQ(signatures[idx], md5.Get()); +} + void TestIntraPred(const char name[], VpxPredFunc const *pred_funcs, - const char *const pred_func_names[], int num_funcs, - const char *const signatures[], int block_size, - int num_pixels_per_test) { - libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); - const int kBPS = 32; - const int kTotalPixels = 32 * kBPS; - DECLARE_ALIGNED(16, uint8_t, src[kTotalPixels]); - DECLARE_ALIGNED(16, uint8_t, ref_src[kTotalPixels]); - DECLARE_ALIGNED(16, uint8_t, left[kBPS]); - DECLARE_ALIGNED(16, uint8_t, above_mem[2 * kBPS + 16]); - uint8_t *const above = above_mem + 16; - for (int i = 0; i < kTotalPixels; ++i) ref_src[i] = rnd.Rand8(); - for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand8(); - for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand8(); - const int kNumTests = static_cast(2.e10 / num_pixels_per_test); + const char *const signatures[], int block_size) { + const int kNumTests = static_cast( + 2.e10 / (block_size * block_size * kNumVp9IntraPredFuncs)); + Vp9IntraPredTestMem intra_pred_test_mem; + const uint8_t *const above = intra_pred_test_mem.above_mem + 16; - // some code assumes the top row has been extended: - // d45/d63 C-code, for instance, but not the assembly. - // TODO(jzern): this style of extension isn't strictly necessary. 
- ASSERT_LE(block_size, kBPS); - memset(above + block_size, above[block_size - 1], 2 * kBPS - block_size); + intra_pred_test_mem.Init(block_size, 8); - for (int k = 0; k < num_funcs; ++k) { + for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) { if (pred_funcs[k] == NULL) continue; - memcpy(src, ref_src, sizeof(src)); + memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src, + sizeof(intra_pred_test_mem.src)); vpx_usec_timer timer; vpx_usec_timer_start(&timer); for (int num_tests = 0; num_tests < kNumTests; ++num_tests) { - pred_funcs[k](src, kBPS, above, left); + pred_funcs[k](intra_pred_test_mem.src, kBPS, above, + intra_pred_test_mem.left); } libvpx_test::ClearSystemState(); vpx_usec_timer_mark(&timer); const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer) / 1000); - libvpx_test::MD5 md5; - md5.Add(src, sizeof(src)); - printf("Mode %s[%12s]: %5d ms MD5: %s\n", name, pred_func_names[k], - elapsed_time, md5.Get()); - EXPECT_STREQ(signatures[k], md5.Get()); + CheckMd5Signature(name, signatures, intra_pred_test_mem.src, + sizeof(intra_pred_test_mem.src), elapsed_time, k); } } void TestIntraPred4(VpxPredFunc const *pred_funcs) { - static const int kNumVp9IntraFuncs = 13; - static const char *const kSignatures[kNumVp9IntraFuncs] = { - "4334156168b34ab599d9b5b30f522fe9", "bc4649d5ba47c7ff178d92e475960fb0", - "8d316e5933326dcac24e1064794b5d12", "a27270fed024eafd762c95de85f4da51", - "c33dff000d4256c2b8f3bf9e9bab14d2", "44d8cddc2ad8f79b8ed3306051722b4f", - "eb54839b2bad6699d8946f01ec041cd0", "ecb0d56ae5f677ea45127ce9d5c058e4", - "0b7936841f6813da818275944895b574", "9117972ef64f91a58ff73e1731c81db2", - "c56d5e8c729e46825f46dd5d3b5d508a", "c0889e2039bcf7bcb5d2f33cdca69adc", - "309a618577b27c648f9c5ee45252bc8f", + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "e7ed7353c3383fff942e500e9bfe82fe", "2a4a26fcc6ce005eadc08354d196c8a9", + "269d92eff86f315d9c38fe7640d85b15", "ae2960eea9f71ee3dabe08b282ec1773", + "6c1abcc44e90148998b51acd11144e9c", "f7bb3186e1ef8a2b326037ff898cad8e", + "364c1f3fb2f445f935aec2a70a67eaa4", "141624072a4a56773f68fadbdd07c4a7", + "7be49b08687a5f24df3a2c612fca3876", "459bb5d9fd5b238348179c9a22108cd6", + "73edb8831bf1bdfce21ae8eaa43b1234", "2e2457f2009c701a355a8b25eb74fcda", + "52ae4e8bdbe41494c1f43051d4dd7f0b" }; - TestIntraPred("Intra4", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs, - kSignatures, 4, 4 * 4 * kNumVp9IntraFuncs); + TestIntraPred("Intra4", pred_funcs, kSignatures, 4); } void TestIntraPred8(VpxPredFunc const *pred_funcs) { - static const int kNumVp9IntraFuncs = 13; - static const char *const kSignatures[kNumVp9IntraFuncs] = { - "7694ddeeefed887faf9d339d18850928", "7d726b1213591b99f736be6dec65065b", - "19c5711281357a485591aaf9c96c0a67", "ba6b66877a089e71cd938e3b8c40caac", - "802440c93317e0f8ba93fab02ef74265", "9e09a47a15deb0b9d8372824f9805080", - "b7c2d8c662268c0c427da412d7b0311d", "78339c1c60bb1d67d248ab8c4da08b7f", - "5c97d70f7d47de1882a6cd86c165c8a9", "8182bf60688b42205acd95e59e967157", - "08323400005a297f16d7e57e7fe1eaac", "95f7bfc262329a5849eda66d8f7c68ce", - "815b75c8e0d91cc1ae766dc5d3e445a3", + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "d8bbae5d6547cfc17e4f5f44c8730e88", "373bab6d931868d41a601d9d88ce9ac3", + "6fdd5ff4ff79656c14747598ca9e3706", "d9661c2811d6a73674f40ffb2b841847", + "7c722d10b19ccff0b8c171868e747385", "f81dd986eb2b50f750d3a7da716b7e27", + "d500f2c8fc78f46a4c74e4dcf51f14fb", "0e3523f9cab2142dd37fd07ec0760bce", + "79ac4efe907f0a0f1885d43066cfedee", "19ecf2432ac305057de3b6578474eec6", + 
"4f985b61acc6dd5d2d2585fa89ea2e2d", "f1bb25a9060dd262f405f15a38f5f674", + "209ea00801584829e9a0f7be7d4a74ba" }; - TestIntraPred("Intra8", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs, - kSignatures, 8, 8 * 8 * kNumVp9IntraFuncs); + TestIntraPred("Intra8", pred_funcs, kSignatures, 8); } void TestIntraPred16(VpxPredFunc const *pred_funcs) { - static const int kNumVp9IntraFuncs = 13; - static const char *const kSignatures[kNumVp9IntraFuncs] = { - "b40dbb555d5d16a043dc361e6694fe53", "fb08118cee3b6405d64c1fd68be878c6", - "6c190f341475c837cc38c2e566b64875", "db5c34ccbe2c7f595d9b08b0dc2c698c", - "a62cbfd153a1f0b9fed13e62b8408a7a", "143df5b4c89335e281103f610f5052e4", - "d87feb124107cdf2cfb147655aa0bb3c", "7841fae7d4d47b519322e6a03eeed9dc", - "f6ebed3f71cbcf8d6d0516ce87e11093", "3cc480297dbfeed01a1c2d78dd03d0c5", - "b9f69fa6532b372c545397dcb78ef311", "a8fe1c70432f09d0c20c67bdb6432c4d", - "b8a41aa968ec108af447af4217cba91b", + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "50971c07ce26977d30298538fffec619", "527a6b9e0dc5b21b98cf276305432bef", + "7eff2868f80ebc2c43a4f367281d80f7", "67cd60512b54964ef6aff1bd4816d922", + "48371c87dc95c08a33b2048f89cf6468", "b0acf2872ee411d7530af6d2625a7084", + "f32aafed4d8d3776ed58bcb6188756d5", "dae208f3dca583529cff49b73f7c4183", + "7af66a2f4c8e0b4908e40f047e60c47c", "125e3ab6ab9bc961f183ec366a7afa88", + "6b90f25b23983c35386b9fd704427622", "f8d6b11d710edc136a7c62c917435f93", + "ed308f18614a362917f411c218aee532" }; - TestIntraPred("Intra16", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs, - kSignatures, 16, 16 * 16 * kNumVp9IntraFuncs); + TestIntraPred("Intra16", pred_funcs, kSignatures, 16); } void TestIntraPred32(VpxPredFunc const *pred_funcs) { - static const int kNumVp9IntraFuncs = 13; - static const char *const kSignatures[kNumVp9IntraFuncs] = { - "558541656d84f9ae7896db655826febe", "b3587a1f9a01495fa38c8cd3c8e2a1bf", - "4c6501e64f25aacc55a2a16c7e8f0255", "b3b01379ba08916ef6b1b35f7d9ad51c", - "0f1eb38b6cbddb3d496199ef9f329071", "911c06efb9ed1c3b4c104b232b55812f", - "9225beb0ddfa7a1d24eaa1be430a6654", "0a6d584a44f8db9aa7ade2e2fdb9fc9e", - "b01c9076525216925f3456f034fb6eee", "d267e20ad9e5cd2915d1a47254d3d149", - "ed012a4a5da71f36c2393023184a0e59", "f162b51ed618d28b936974cff4391da5", - "9e1370c6d42e08d357d9612c93a71cfc", + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "a0a618c900e65ae521ccc8af789729f2", "985aaa7c72b4a6c2fb431d32100cf13a", + "10662d09febc3ca13ee4e700120daeb5", "b3b01379ba08916ef6b1b35f7d9ad51c", + "9f4261755795af97e34679c333ec7004", "bc2c9da91ad97ef0d1610fb0a9041657", + "75c79b1362ad18abfcdb1aa0aacfc21d", "4039bb7da0f6860090d3c57b5c85468f", + "b29fff7b61804e68383e3a609b33da58", "e1aa5e49067fd8dba66c2eb8d07b7a89", + "4e042822909c1c06d3b10a88281df1eb", "72eb9d9e0e67c93f4c66b70348e9fef7", + "a22d102bcb51ca798aac12ca4ae8f2e8" }; - TestIntraPred("Intra32", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs, - kSignatures, 32, 32 * 32 * kNumVp9IntraFuncs); + TestIntraPred("Intra32", pred_funcs, kSignatures, 32); } } // namespace @@ -153,7 +169,6 @@ void TestIntraPred32(VpxPredFunc const *pred_funcs) { } // ----------------------------------------------------------------------------- -// 4x4 INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c, vpx_dc_left_predictor_4x4_c, vpx_dc_top_predictor_4x4_c, @@ -163,47 +178,6 @@ INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c, vpx_d153_predictor_4x4_c, vpx_d207_predictor_4x4_c, vpx_d63_predictor_4x4_c, vpx_tm_predictor_4x4_c) -#if HAVE_SSE2 
-INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2, - vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2, - vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2, - vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL, - NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL, - vpx_tm_predictor_4x4_sse2) -#endif // HAVE_SSE2 - -#if HAVE_SSSE3 -INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_d153_predictor_4x4_ssse3, NULL, - vpx_d63_predictor_4x4_ssse3, NULL) -#endif // HAVE_SSSE3 - -#if HAVE_DSPR2 -INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, NULL, NULL, - NULL, NULL, vpx_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_tm_predictor_4x4_dspr2) -#endif // HAVE_DSPR2 - -#if HAVE_NEON -INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon, - vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon, - vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon, - vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon, - vpx_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL, - vpx_tm_predictor_4x4_neon) -#endif // HAVE_NEON - -#if HAVE_MSA -INTRA_PRED_TEST(MSA, TestIntraPred4, vpx_dc_predictor_4x4_msa, - vpx_dc_left_predictor_4x4_msa, vpx_dc_top_predictor_4x4_msa, - vpx_dc_128_predictor_4x4_msa, vpx_v_predictor_4x4_msa, - vpx_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_4x4_msa) -#endif // HAVE_MSA - -// ----------------------------------------------------------------------------- -// 8x8 - INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c, vpx_dc_left_predictor_8x8_c, vpx_dc_top_predictor_8x8_c, vpx_dc_128_predictor_8x8_c, vpx_v_predictor_8x8_c, @@ -212,46 +186,6 @@ INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c, vpx_d153_predictor_8x8_c, vpx_d207_predictor_8x8_c, vpx_d63_predictor_8x8_c, vpx_tm_predictor_8x8_c) -#if HAVE_SSE2 -INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2, - vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2, - vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2, - vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL, - NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2) -#endif // HAVE_SSE2 - -#if HAVE_SSSE3 -INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_d153_predictor_8x8_ssse3, - vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL) -#endif // HAVE_SSSE3 - -#if HAVE_DSPR2 -INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, NULL, NULL, - NULL, NULL, vpx_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_tm_predictor_8x8_c) -#endif // HAVE_DSPR2 - -#if HAVE_NEON -INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon, - vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon, - vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon, - vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, NULL, - NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_neon) - -#endif // HAVE_NEON - -#if HAVE_MSA -INTRA_PRED_TEST(MSA, TestIntraPred8, vpx_dc_predictor_8x8_msa, - vpx_dc_left_predictor_8x8_msa, vpx_dc_top_predictor_8x8_msa, - vpx_dc_128_predictor_8x8_msa, vpx_v_predictor_8x8_msa, - vpx_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_8x8_msa) -#endif // HAVE_MSA - -// ----------------------------------------------------------------------------- -// 16x16 - INTRA_PRED_TEST(C, TestIntraPred16, vpx_dc_predictor_16x16_c, vpx_dc_left_predictor_16x16_c, 
vpx_dc_top_predictor_16x16_c, vpx_dc_128_predictor_16x16_c, vpx_v_predictor_16x16_c, @@ -260,48 +194,6 @@ INTRA_PRED_TEST(C, TestIntraPred16, vpx_dc_predictor_16x16_c, vpx_d153_predictor_16x16_c, vpx_d207_predictor_16x16_c, vpx_d63_predictor_16x16_c, vpx_tm_predictor_16x16_c) -#if HAVE_SSE2 -INTRA_PRED_TEST(SSE2, TestIntraPred16, vpx_dc_predictor_16x16_sse2, - vpx_dc_left_predictor_16x16_sse2, - vpx_dc_top_predictor_16x16_sse2, - vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2, - vpx_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_16x16_sse2) -#endif // HAVE_SSE2 - -#if HAVE_SSSE3 -INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_d45_predictor_16x16_ssse3, NULL, NULL, - vpx_d153_predictor_16x16_ssse3, vpx_d207_predictor_16x16_ssse3, - vpx_d63_predictor_16x16_ssse3, NULL) -#endif // HAVE_SSSE3 - -#if HAVE_DSPR2 -INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, NULL, - NULL, NULL, NULL, vpx_h_predictor_16x16_dspr2, NULL, NULL, NULL, - NULL, NULL, NULL, NULL) -#endif // HAVE_DSPR2 - -#if HAVE_NEON -INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, - vpx_dc_left_predictor_16x16_neon, - vpx_dc_top_predictor_16x16_neon, - vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon, - vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, NULL, - NULL, NULL, NULL, NULL, vpx_tm_predictor_16x16_neon) -#endif // HAVE_NEON - -#if HAVE_MSA -INTRA_PRED_TEST(MSA, TestIntraPred16, vpx_dc_predictor_16x16_msa, - vpx_dc_left_predictor_16x16_msa, vpx_dc_top_predictor_16x16_msa, - vpx_dc_128_predictor_16x16_msa, vpx_v_predictor_16x16_msa, - vpx_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_16x16_msa) -#endif // HAVE_MSA - -// ----------------------------------------------------------------------------- -// 32x32 - INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c, vpx_dc_left_predictor_32x32_c, vpx_dc_top_predictor_32x32_c, vpx_dc_128_predictor_32x32_c, vpx_v_predictor_32x32_c, @@ -311,6 +203,26 @@ INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c, vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c) #if HAVE_SSE2 +INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2, + vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2, + vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2, + vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL, + NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL, + vpx_tm_predictor_4x4_sse2) + +INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2, + vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2, + vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2, + vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL, + NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2) + +INTRA_PRED_TEST(SSE2, TestIntraPred16, vpx_dc_predictor_16x16_sse2, + vpx_dc_left_predictor_16x16_sse2, + vpx_dc_top_predictor_16x16_sse2, + vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2, + vpx_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL, + vpx_tm_predictor_16x16_sse2) + INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2, vpx_dc_left_predictor_32x32_sse2, vpx_dc_top_predictor_32x32_sse2, @@ -320,22 +232,79 @@ INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2, #endif // HAVE_SSE2 #if HAVE_SSSE3 +INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_d153_predictor_4x4_ssse3, NULL, + 
vpx_d63_predictor_4x4_ssse3, NULL) +INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_d153_predictor_8x8_ssse3, + vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL) +INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL, NULL, + vpx_d45_predictor_16x16_ssse3, NULL, NULL, + vpx_d153_predictor_16x16_ssse3, vpx_d207_predictor_16x16_ssse3, + vpx_d63_predictor_16x16_ssse3, NULL) INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL, NULL, vpx_d45_predictor_32x32_ssse3, NULL, NULL, vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3, vpx_d63_predictor_32x32_ssse3, NULL) #endif // HAVE_SSSE3 +#if HAVE_DSPR2 +INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, NULL, NULL, + NULL, NULL, vpx_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_tm_predictor_4x4_dspr2) +INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, NULL, NULL, + NULL, NULL, vpx_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_tm_predictor_8x8_c) +INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, NULL, + NULL, NULL, NULL, vpx_h_predictor_16x16_dspr2, NULL, NULL, NULL, + NULL, NULL, NULL, NULL) +#endif // HAVE_DSPR2 + #if HAVE_NEON +INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon, + vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon, + vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon, + vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon, + vpx_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL, + vpx_tm_predictor_4x4_neon) +INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon, + vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon, + vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon, + vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, + vpx_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL, + vpx_tm_predictor_8x8_neon) +INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, + vpx_dc_left_predictor_16x16_neon, + vpx_dc_top_predictor_16x16_neon, + vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon, + vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, + vpx_d135_predictor_16x16_neon, NULL, NULL, NULL, NULL, + vpx_tm_predictor_16x16_neon) INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon, vpx_dc_left_predictor_32x32_neon, vpx_dc_top_predictor_32x32_neon, vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon, - vpx_h_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL, NULL, + vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon, + vpx_d135_predictor_32x32_neon, NULL, NULL, NULL, NULL, vpx_tm_predictor_32x32_neon) #endif // HAVE_NEON #if HAVE_MSA +INTRA_PRED_TEST(MSA, TestIntraPred4, vpx_dc_predictor_4x4_msa, + vpx_dc_left_predictor_4x4_msa, vpx_dc_top_predictor_4x4_msa, + vpx_dc_128_predictor_4x4_msa, vpx_v_predictor_4x4_msa, + vpx_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL, NULL, + vpx_tm_predictor_4x4_msa) +INTRA_PRED_TEST(MSA, TestIntraPred8, vpx_dc_predictor_8x8_msa, + vpx_dc_left_predictor_8x8_msa, vpx_dc_top_predictor_8x8_msa, + vpx_dc_128_predictor_8x8_msa, vpx_v_predictor_8x8_msa, + vpx_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL, NULL, + vpx_tm_predictor_8x8_msa) +INTRA_PRED_TEST(MSA, TestIntraPred16, vpx_dc_predictor_16x16_msa, + vpx_dc_left_predictor_16x16_msa, vpx_dc_top_predictor_16x16_msa, + vpx_dc_128_predictor_16x16_msa, vpx_v_predictor_16x16_msa, + vpx_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL, NULL, + 
vpx_tm_predictor_16x16_msa) INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa, vpx_dc_left_predictor_32x32_msa, vpx_dc_top_predictor_32x32_msa, vpx_dc_128_predictor_32x32_msa, vpx_v_predictor_32x32_msa, @@ -343,4 +312,275 @@ INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa, vpx_tm_predictor_32x32_msa) #endif // HAVE_MSA +#if HAVE_VSX +INTRA_PRED_TEST(VSX, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, + vpx_h_predictor_4x4_vsx, NULL, NULL, NULL, NULL, NULL, NULL, + vpx_tm_predictor_4x4_vsx) + +INTRA_PRED_TEST(VSX, TestIntraPred8, vpx_dc_predictor_8x8_vsx, NULL, NULL, NULL, + NULL, vpx_h_predictor_8x8_vsx, vpx_d45_predictor_8x8_vsx, NULL, + NULL, NULL, NULL, vpx_d63_predictor_8x8_vsx, + vpx_tm_predictor_8x8_vsx) + +INTRA_PRED_TEST(VSX, TestIntraPred16, vpx_dc_predictor_16x16_vsx, + vpx_dc_left_predictor_16x16_vsx, vpx_dc_top_predictor_16x16_vsx, + vpx_dc_128_predictor_16x16_vsx, vpx_v_predictor_16x16_vsx, + vpx_h_predictor_16x16_vsx, vpx_d45_predictor_16x16_vsx, NULL, + NULL, NULL, NULL, vpx_d63_predictor_16x16_vsx, + vpx_tm_predictor_16x16_vsx) + +INTRA_PRED_TEST(VSX, TestIntraPred32, vpx_dc_predictor_32x32_vsx, + vpx_dc_left_predictor_32x32_vsx, vpx_dc_top_predictor_32x32_vsx, + vpx_dc_128_predictor_32x32_vsx, vpx_v_predictor_32x32_vsx, + vpx_h_predictor_32x32_vsx, vpx_d45_predictor_32x32_vsx, NULL, + NULL, NULL, NULL, vpx_d63_predictor_32x32_vsx, + vpx_tm_predictor_32x32_vsx) +#endif // HAVE_VSX + +// ----------------------------------------------------------------------------- + +#if CONFIG_VP9_HIGHBITDEPTH +namespace { + +typedef void (*VpxHighbdPredFunc)(uint16_t *dst, ptrdiff_t y_stride, + const uint16_t *above, const uint16_t *left, + int bd); + +typedef IntraPredTestMem Vp9HighbdIntraPredTestMem; + +void TestHighbdIntraPred(const char name[], VpxHighbdPredFunc const *pred_funcs, + const char *const signatures[], int block_size) { + const int kNumTests = static_cast( + 2.e10 / (block_size * block_size * kNumVp9IntraPredFuncs)); + Vp9HighbdIntraPredTestMem intra_pred_test_mem; + const uint16_t *const above = intra_pred_test_mem.above_mem + 16; + + intra_pred_test_mem.Init(block_size, 12); + + for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) { + if (pred_funcs[k] == NULL) continue; + memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src, + sizeof(intra_pred_test_mem.src)); + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int num_tests = 0; num_tests < kNumTests; ++num_tests) { + pred_funcs[k](intra_pred_test_mem.src, kBPS, above, + intra_pred_test_mem.left, 12); + } + libvpx_test::ClearSystemState(); + vpx_usec_timer_mark(&timer); + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + CheckMd5Signature(name, signatures, intra_pred_test_mem.src, + sizeof(intra_pred_test_mem.src), elapsed_time, k); + } +} + +void TestHighbdIntraPred4(VpxHighbdPredFunc const *pred_funcs) { + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "11f74af6c5737df472f3275cbde062fa", "51bea056b6447c93f6eb8f6b7e8f6f71", + "27e97f946766331795886f4de04c5594", "53ab15974b049111fb596c5168ec7e3f", + "f0b640bb176fbe4584cf3d32a9b0320a", "729783ca909e03afd4b47111c80d967b", + "fbf1c30793d9f32812e4d9f905d53530", "293fc903254a33754133314c6cdba81f", + "f8074d704233e73dfd35b458c6092374", "aa6363d08544a1ec4da33d7a0be5640d", + "462abcfdfa3d087bb33c9a88f2aec491", "863eab65d22550dd44a2397277c1ec71", + "23d61df1574d0fa308f9731811047c4b" + }; + TestHighbdIntraPred("Intra4", pred_funcs, kSignatures, 4); +} + +void 
TestHighbdIntraPred8(VpxHighbdPredFunc const *pred_funcs) { + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "03da8829fe94663047fd108c5fcaa71d", "ecdb37b8120a2d3a4c706b016bd1bfd7", + "1d4543ed8d2b9368cb96898095fe8a75", "f791c9a67b913cbd82d9da8ecede30e2", + "065c70646f4dbaff913282f55a45a441", "51f87123616662ef7c35691497dfd0ba", + "2a5b0131ef4716f098ee65e6df01e3dd", "9ffe186a6bc7db95275f1bbddd6f7aba", + "a3258a2eae2e2bd55cb8f71351b22998", "8d909f0a2066e39b3216092c6289ece4", + "d183abb30b9f24c886a0517e991b22c7", "702a42fe4c7d665dc561b2aeeb60f311", + "7b5dbbbe7ae3a4ac2948731600bde5d6" + }; + TestHighbdIntraPred("Intra8", pred_funcs, kSignatures, 8); +} + +void TestHighbdIntraPred16(VpxHighbdPredFunc const *pred_funcs) { + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "e33cb3f56a878e2fddb1b2fc51cdd275", "c7bff6f04b6052c8ab335d726dbbd52d", + "d0b0b47b654a9bcc5c6008110a44589b", "78f5da7b10b2b9ab39f114a33b6254e9", + "c78e31d23831abb40d6271a318fdd6f3", "90d1347f4ec9198a0320daecb6ff90b8", + "d2c623746cbb64a0c9e29c10f2c57041", "cf28bd387b81ad3e5f1a1c779a4b70a0", + "24c304330431ddeaf630f6ce94af2eac", "91a329798036bf64e8e00a87b131b8b1", + "d39111f22885307f920796a42084c872", "e2e702f7250ece98dd8f3f2854c31eeb", + "e2fb05b01eb8b88549e85641d8ce5b59" + }; + TestHighbdIntraPred("Intra16", pred_funcs, kSignatures, 16); +} + +void TestHighbdIntraPred32(VpxHighbdPredFunc const *pred_funcs) { + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "a3e8056ba7e36628cce4917cd956fedd", "cc7d3024fe8748b512407edee045377e", + "2aab0a0f330a1d3e19b8ecb8f06387a3", "a547bc3fb7b06910bf3973122a426661", + "26f712514da95042f93d6e8dc8e431dc", "bb08c6e16177081daa3d936538dbc2e3", + "8f031af3e2650e89620d8d2c3a843d8b", "42867c8553285e94ee8e4df7abafbda8", + "6496bdee96100667833f546e1be3d640", "2ebfa25bf981377e682e580208504300", + "3e8ae52fd1f607f348aa4cb436c71ab7", "3d4efe797ca82193613696753ea624c4", + "cb8aab6d372278f3131e8d99efde02d9" + }; + TestHighbdIntraPred("Intra32", pred_funcs, kSignatures, 32); +} + +} // namespace + +// Defines a test case for |arch| (e.g., C, SSE2, ...) passing the predictors +// to |test_func|. The test name is 'arch.test_func', e.g., C.TestIntraPred4. 
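// The HIGHBD_INTRA_PRED_TEST macro defined just below stamps out one Google
// Test case per (architecture, high-bit-depth test function) pair; passing
// NULL for a slot leaves a hole in the function-pointer table that the shared
// helper skips. As a rough illustration (not a literal preprocessor dump),
// the C/4x4 instantiation that appears further down expands to approximately
// the following:

TEST(C, TestHighbdIntraPred4) {
  static const VpxHighbdPredFunc vpx_intra_pred[] = {
    vpx_highbd_dc_predictor_4x4_c,     vpx_highbd_dc_left_predictor_4x4_c,
    vpx_highbd_dc_top_predictor_4x4_c, vpx_highbd_dc_128_predictor_4x4_c,
    vpx_highbd_v_predictor_4x4_c,      vpx_highbd_h_predictor_4x4_c,
    vpx_highbd_d45_predictor_4x4_c,    vpx_highbd_d135_predictor_4x4_c,
    vpx_highbd_d117_predictor_4x4_c,   vpx_highbd_d153_predictor_4x4_c,
    vpx_highbd_d207_predictor_4x4_c,   vpx_highbd_d63_predictor_4x4_c,
    vpx_highbd_tm_predictor_4x4_c
  };
  TestHighbdIntraPred4(vpx_intra_pred);
}

// Each generated case can then be selected on its own, e.g. with gtest's
// --gtest_filter=C.TestHighbdIntraPred4.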
+#define HIGHBD_INTRA_PRED_TEST(arch, test_func, dc, dc_left, dc_top, dc_128, \ + v, h, d45, d135, d117, d153, d207, d63, tm) \ + TEST(arch, test_func) { \ + static const VpxHighbdPredFunc vpx_intra_pred[] = { \ + dc, dc_left, dc_top, dc_128, v, h, d45, d135, d117, d153, d207, d63, tm \ + }; \ + test_func(vpx_intra_pred); \ + } + +// ----------------------------------------------------------------------------- + +HIGHBD_INTRA_PRED_TEST( + C, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_c, + vpx_highbd_dc_left_predictor_4x4_c, vpx_highbd_dc_top_predictor_4x4_c, + vpx_highbd_dc_128_predictor_4x4_c, vpx_highbd_v_predictor_4x4_c, + vpx_highbd_h_predictor_4x4_c, vpx_highbd_d45_predictor_4x4_c, + vpx_highbd_d135_predictor_4x4_c, vpx_highbd_d117_predictor_4x4_c, + vpx_highbd_d153_predictor_4x4_c, vpx_highbd_d207_predictor_4x4_c, + vpx_highbd_d63_predictor_4x4_c, vpx_highbd_tm_predictor_4x4_c) + +HIGHBD_INTRA_PRED_TEST( + C, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_c, + vpx_highbd_dc_left_predictor_8x8_c, vpx_highbd_dc_top_predictor_8x8_c, + vpx_highbd_dc_128_predictor_8x8_c, vpx_highbd_v_predictor_8x8_c, + vpx_highbd_h_predictor_8x8_c, vpx_highbd_d45_predictor_8x8_c, + vpx_highbd_d135_predictor_8x8_c, vpx_highbd_d117_predictor_8x8_c, + vpx_highbd_d153_predictor_8x8_c, vpx_highbd_d207_predictor_8x8_c, + vpx_highbd_d63_predictor_8x8_c, vpx_highbd_tm_predictor_8x8_c) + +HIGHBD_INTRA_PRED_TEST( + C, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_c, + vpx_highbd_dc_left_predictor_16x16_c, vpx_highbd_dc_top_predictor_16x16_c, + vpx_highbd_dc_128_predictor_16x16_c, vpx_highbd_v_predictor_16x16_c, + vpx_highbd_h_predictor_16x16_c, vpx_highbd_d45_predictor_16x16_c, + vpx_highbd_d135_predictor_16x16_c, vpx_highbd_d117_predictor_16x16_c, + vpx_highbd_d153_predictor_16x16_c, vpx_highbd_d207_predictor_16x16_c, + vpx_highbd_d63_predictor_16x16_c, vpx_highbd_tm_predictor_16x16_c) + +HIGHBD_INTRA_PRED_TEST( + C, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_c, + vpx_highbd_dc_left_predictor_32x32_c, vpx_highbd_dc_top_predictor_32x32_c, + vpx_highbd_dc_128_predictor_32x32_c, vpx_highbd_v_predictor_32x32_c, + vpx_highbd_h_predictor_32x32_c, vpx_highbd_d45_predictor_32x32_c, + vpx_highbd_d135_predictor_32x32_c, vpx_highbd_d117_predictor_32x32_c, + vpx_highbd_d153_predictor_32x32_c, vpx_highbd_d207_predictor_32x32_c, + vpx_highbd_d63_predictor_32x32_c, vpx_highbd_tm_predictor_32x32_c) + +#if HAVE_SSE2 +HIGHBD_INTRA_PRED_TEST( + SSE2, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_sse2, + vpx_highbd_dc_left_predictor_4x4_sse2, vpx_highbd_dc_top_predictor_4x4_sse2, + vpx_highbd_dc_128_predictor_4x4_sse2, vpx_highbd_v_predictor_4x4_sse2, + vpx_highbd_h_predictor_4x4_sse2, NULL, vpx_highbd_d135_predictor_4x4_sse2, + vpx_highbd_d117_predictor_4x4_sse2, vpx_highbd_d153_predictor_4x4_sse2, + vpx_highbd_d207_predictor_4x4_sse2, vpx_highbd_d63_predictor_4x4_sse2, + vpx_highbd_tm_predictor_4x4_c) + +HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8, + vpx_highbd_dc_predictor_8x8_sse2, + vpx_highbd_dc_left_predictor_8x8_sse2, + vpx_highbd_dc_top_predictor_8x8_sse2, + vpx_highbd_dc_128_predictor_8x8_sse2, + vpx_highbd_v_predictor_8x8_sse2, + vpx_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2) + +HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16, + vpx_highbd_dc_predictor_16x16_sse2, + vpx_highbd_dc_left_predictor_16x16_sse2, + vpx_highbd_dc_top_predictor_16x16_sse2, + vpx_highbd_dc_128_predictor_16x16_sse2, + vpx_highbd_v_predictor_16x16_sse2, + 
vpx_highbd_h_predictor_16x16_sse2, NULL, NULL, NULL, + NULL, NULL, NULL, vpx_highbd_tm_predictor_16x16_sse2) + +HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32, + vpx_highbd_dc_predictor_32x32_sse2, + vpx_highbd_dc_left_predictor_32x32_sse2, + vpx_highbd_dc_top_predictor_32x32_sse2, + vpx_highbd_dc_128_predictor_32x32_sse2, + vpx_highbd_v_predictor_32x32_sse2, + vpx_highbd_h_predictor_32x32_sse2, NULL, NULL, NULL, + NULL, NULL, NULL, vpx_highbd_tm_predictor_32x32_sse2) +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred4, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_d45_predictor_4x4_ssse3, NULL, + NULL, NULL, NULL, NULL, NULL) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred8, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_d45_predictor_8x8_ssse3, + vpx_highbd_d135_predictor_8x8_ssse3, + vpx_highbd_d117_predictor_8x8_ssse3, + vpx_highbd_d153_predictor_8x8_ssse3, + vpx_highbd_d207_predictor_8x8_ssse3, + vpx_highbd_d63_predictor_8x8_ssse3, NULL) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred16, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_d45_predictor_16x16_ssse3, + vpx_highbd_d135_predictor_16x16_ssse3, + vpx_highbd_d117_predictor_16x16_ssse3, + vpx_highbd_d153_predictor_16x16_ssse3, + vpx_highbd_d207_predictor_16x16_ssse3, + vpx_highbd_d63_predictor_16x16_ssse3, NULL) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred32, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_d45_predictor_32x32_ssse3, + vpx_highbd_d135_predictor_32x32_ssse3, + vpx_highbd_d117_predictor_32x32_ssse3, + vpx_highbd_d153_predictor_32x32_ssse3, + vpx_highbd_d207_predictor_32x32_ssse3, + vpx_highbd_d63_predictor_32x32_ssse3, NULL) +#endif // HAVE_SSSE3 + +#if HAVE_NEON +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_neon, + vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon, + vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon, + vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon, + vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL, + vpx_highbd_tm_predictor_4x4_neon) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, + vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon, + vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon, + vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon, + vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL, + vpx_highbd_tm_predictor_8x8_neon) +HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16, + vpx_highbd_dc_predictor_16x16_neon, + vpx_highbd_dc_left_predictor_16x16_neon, + vpx_highbd_dc_top_predictor_16x16_neon, + vpx_highbd_dc_128_predictor_16x16_neon, + vpx_highbd_v_predictor_16x16_neon, + vpx_highbd_h_predictor_16x16_neon, + vpx_highbd_d45_predictor_16x16_neon, + vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL, + NULL, vpx_highbd_tm_predictor_16x16_neon) +HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32, + vpx_highbd_dc_predictor_32x32_neon, + vpx_highbd_dc_left_predictor_32x32_neon, + vpx_highbd_dc_top_predictor_32x32_neon, + vpx_highbd_dc_128_predictor_32x32_neon, + vpx_highbd_v_predictor_32x32_neon, + vpx_highbd_h_predictor_32x32_neon, + vpx_highbd_d45_predictor_32x32_neon, + vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL, + NULL, vpx_highbd_tm_predictor_32x32_neon) +#endif // HAVE_NEON + +#endif // CONFIG_VP9_HIGHBITDEPTH + #include "test/test_libvpx.cc" diff --git a/libs/libvpx/test/test_libvpx.cc 
b/libs/libvpx/test/test_libvpx.cc index 8a70b4e28c..30641ae8c8 100644 --- a/libs/libvpx/test/test_libvpx.cc +++ b/libs/libvpx/test/test_libvpx.cc @@ -53,6 +53,9 @@ int main(int argc, char **argv) { } if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*"); if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*"); + if (!(simd_caps & HAS_AVX512)) { + append_negative_gtest_filter(":AVX512.*:AVX512/*"); + } #endif // ARCH_X86 || ARCH_X86_64 #if !CONFIG_SHARED diff --git a/libs/libvpx/test/test_vector_test.cc b/libs/libvpx/test/test_vector_test.cc index 6d43157404..1879b3d277 100644 --- a/libs/libvpx/test/test_vector_test.cc +++ b/libs/libvpx/test/test_vector_test.cc @@ -28,13 +28,10 @@ namespace { -enum DecodeMode { kSerialMode, kFrameParallelMode }; +const int kThreads = 0; +const int kFileName = 1; -const int kDecodeMode = 0; -const int kThreads = 1; -const int kFileName = 2; - -typedef std::tr1::tuple DecodeParam; +typedef std::tr1::tuple DecodeParam; class TestVectorTest : public ::libvpx_test::DecoderTest, public ::libvpx_test::CodecTestWithParam { @@ -53,8 +50,8 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, void OpenMD5File(const std::string &md5_file_name_) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); - ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: " - << md5_file_name_; + ASSERT_TRUE(md5_file_ != NULL) + << "Md5 file open failed. Filename: " << md5_file_name_; } virtual void DecompressedFrameHook(const vpx_image_t &img, @@ -92,29 +89,14 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, TEST_P(TestVectorTest, MD5Match) { const DecodeParam input = GET_PARAM(1); const std::string filename = std::tr1::get(input); - const int threads = std::tr1::get(input); - const int mode = std::tr1::get(input); vpx_codec_flags_t flags = 0; vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); char str[256]; - if (mode == kFrameParallelMode) { - flags |= VPX_CODEC_USE_FRAME_THREADING; -#if CONFIG_VP9_DECODER - // TODO(hkuang): Fix frame parallel decode bug. See issue 1086. - if (resize_clips_.find(filename) != resize_clips_.end()) { - printf("Skipping the test file: %s, due to frame parallel decode bug.\n", - filename.c_str()); - return; - } -#endif - } + cfg.threads = std::tr1::get(input); - cfg.threads = threads; - - snprintf(str, sizeof(str) / sizeof(str[0]) - 1, - "file: %s mode: %s threads: %d", filename.c_str(), - mode == 0 ? "Serial" : "Parallel", threads); + snprintf(str, sizeof(str) / sizeof(str[0]) - 1, "file: %s threads: %d", + filename.c_str(), cfg.threads); SCOPED_TRACE(str); // Open compressed video file. @@ -145,38 +127,44 @@ TEST_P(TestVectorTest, MD5Match) { ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg)); } -// Test VP8 decode in serial mode with single thread. -// NOTE: VP8 only support serial mode. #if CONFIG_VP8_DECODER VP8_INSTANTIATE_TEST_CASE( TestVectorTest, ::testing::Combine( - ::testing::Values(0), // Serial Mode. ::testing::Values(1), // Single thread. ::testing::ValuesIn(libvpx_test::kVP8TestVectors, libvpx_test::kVP8TestVectors + libvpx_test::kNumVP8TestVectors))); + +// Test VP8 decode in with different numbers of threads. +INSTANTIATE_TEST_CASE_P( + VP8MultiThreaded, TestVectorTest, + ::testing::Combine( + ::testing::Values( + static_cast(&libvpx_test::kVP8)), + ::testing::Combine( + ::testing::Range(2, 9), // With 2 ~ 8 threads. 
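// The VP8MultiThreaded / VP9MultiThreaded instantiations in this hunk combine
// ::testing::Range(2, 9) with the test-vector list so that every clip is
// decoded with 2 through 8 threads (gtest's Range excludes its upper bound);
// the separate decode-mode axis (serial vs. frame parallel) is dropped from
// the parameter tuple. A minimal, self-contained sketch of the same
// parameterization pattern follows, with hypothetical fixture and clip names,
// assuming a googletest recent enough to use std::tuple with Combine.

#include <gtest/gtest.h>

#include <string>
#include <tuple>

// Parameter: (thread count, clip name) -- a stand-in for the DecodeParam
// tuple used by the real test, with a stub body instead of a decoder.
typedef std::tuple<int, const char *> StubDecodeParam;

class StubDecodeTest : public ::testing::TestWithParam<StubDecodeParam> {};

TEST_P(StubDecodeTest, ThreadCountInRange) {
  const int threads = std::get<0>(GetParam());
  const std::string clip = std::get<1>(GetParam());
  EXPECT_GE(threads, 2);
  EXPECT_LE(threads, 8);  // Range(2, 9) generates 2..8.
  EXPECT_FALSE(clip.empty());
}

static const char *const kClips[] = { "clip-a.webm", "clip-b.webm" };

// 7 thread counts x 2 clips = 14 generated test instances.
INSTANTIATE_TEST_CASE_P(
    MultiThreaded, StubDecodeTest,
    ::testing::Combine(::testing::Range(2, 9), ::testing::ValuesIn(kClips)));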
+ ::testing::ValuesIn(libvpx_test::kVP8TestVectors, + libvpx_test::kVP8TestVectors + + libvpx_test::kNumVP8TestVectors)))); + #endif // CONFIG_VP8_DECODER -// Test VP9 decode in serial mode with single thread. #if CONFIG_VP9_DECODER VP9_INSTANTIATE_TEST_CASE( TestVectorTest, ::testing::Combine( - ::testing::Values(0), // Serial Mode. ::testing::Values(1), // Single thread. ::testing::ValuesIn(libvpx_test::kVP9TestVectors, libvpx_test::kVP9TestVectors + libvpx_test::kNumVP9TestVectors))); -// Test VP9 decode in frame parallel mode with different number of threads. INSTANTIATE_TEST_CASE_P( - VP9MultiThreadedFrameParallel, TestVectorTest, + VP9MultiThreaded, TestVectorTest, ::testing::Combine( ::testing::Values( static_cast(&libvpx_test::kVP9)), ::testing::Combine( - ::testing::Values(1), // Frame Parallel mode. ::testing::Range(2, 9), // With 2 ~ 8 threads. ::testing::ValuesIn(libvpx_test::kVP9TestVectors, libvpx_test::kVP9TestVectors + diff --git a/libs/libvpx/test/test_vectors.cc b/libs/libvpx/test/test_vectors.cc index 460c1f51b4..3ffc3efc41 100644 --- a/libs/libvpx/test/test_vectors.cc +++ b/libs/libvpx/test/test_vectors.cc @@ -371,9 +371,12 @@ const char *const kVP9TestVectors[] = { #endif // CONFIG_VP9_HIGHBITDEPTH "vp90-2-20-big_superframe-01.webm", "vp90-2-20-big_superframe-02.webm", + "vp90-2-22-svc_1280x720_1.webm", RESIZE_TEST_VECTORS }; +const char *const kVP9TestVectorsSvc[] = { "vp90-2-22-svc_1280x720_3.ivf" }; const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors); +const int kNumVP9TestVectorsSvc = NELEMENTS(kVP9TestVectorsSvc); const char *const kVP9TestVectorsResize[] = { RESIZE_TEST_VECTORS }; const int kNumVP9TestVectorsResize = NELEMENTS(kVP9TestVectorsResize); #undef RESIZE_TEST_VECTORS diff --git a/libs/libvpx/test/test_vectors.h b/libs/libvpx/test/test_vectors.h index 2c6918abda..3df3e81133 100644 --- a/libs/libvpx/test/test_vectors.h +++ b/libs/libvpx/test/test_vectors.h @@ -23,6 +23,8 @@ extern const char *const kVP8TestVectors[]; #if CONFIG_VP9_DECODER extern const int kNumVP9TestVectors; extern const char *const kVP9TestVectors[]; +extern const int kNumVP9TestVectorsSvc; +extern const char *const kVP9TestVectorsSvc[]; extern const int kNumVP9TestVectorsResize; extern const char *const kVP9TestVectorsResize[]; #endif // CONFIG_VP9_DECODER diff --git a/libs/libvpx/test/twopass_encoder.sh b/libs/libvpx/test/twopass_encoder.sh index 7a223f2afc..eaeaabdfd0 100755 --- a/libs/libvpx/test/twopass_encoder.sh +++ b/libs/libvpx/test/twopass_encoder.sh @@ -54,7 +54,10 @@ twopass_encoder_vp9() { fi } -twopass_encoder_tests="twopass_encoder_vp8 - twopass_encoder_vp9" -run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}" +if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then + twopass_encoder_tests="twopass_encoder_vp8 + twopass_encoder_vp9" + + run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}" +fi diff --git a/libs/libvpx/test/variance_test.cc b/libs/libvpx/test/variance_test.cc index 6e31165faf..421024ad88 100644 --- a/libs/libvpx/test/variance_test.cc +++ b/libs/libvpx/test/variance_test.cc @@ -22,6 +22,7 @@ #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" namespace { @@ -345,6 +346,7 @@ class MainTestClass void RefTest(); void RefStrideTest(); void OneQuarterTest(); + void SpeedTest(); // MSE/SSE tests void RefTestMse(); @@ -363,6 +365,7 @@ class MainTestClass int byte_shift() const { return params_.bit_depth - 8; } int block_size() const { 
return params_.block_size; } int width() const { return params_.width; } + int height() const { return params_.height; } uint32_t mask() const { return params_.mask; } }; @@ -471,6 +474,35 @@ void MainTestClass::OneQuarterTest() { EXPECT_EQ(expected, var); } +template +void MainTestClass::SpeedTest() { + const int half = block_size() / 2; + if (!use_high_bit_depth()) { + memset(src_, 255, block_size()); + memset(ref_, 255, half); + memset(ref_ + half, 0, half); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size()); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << byte_shift(), half); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + unsigned int sse; + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < (1 << 30) / block_size(); ++i) { + const uint32_t variance = params_.func(src_, width(), ref_, width(), &sse); + // Ignore return value. + (void)variance; + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("Variance %dx%d time: %5d ms\n", width(), height(), + elapsed_time / 1000); +} + //////////////////////////////////////////////////////////////////////////////// // Tests related to MSE / SSE. @@ -529,46 +561,26 @@ void MainTestClass::MaxTestSse() { //////////////////////////////////////////////////////////////////////////////// -using ::std::tr1::get; -using ::std::tr1::make_tuple; -using ::std::tr1::tuple; - -template +template class SubpelVarianceTest - : public ::testing::TestWithParam< - tuple > { + : public ::testing::TestWithParam > { public: virtual void SetUp() { - const tuple ¶ms = - this->GetParam(); - log2width_ = get<0>(params); - width_ = 1 << log2width_; - log2height_ = get<1>(params); - height_ = 1 << log2height_; - subpel_variance_ = get<2>(params); - if (get<3>(params)) { - bit_depth_ = (vpx_bit_depth_t)get<3>(params); - use_high_bit_depth_ = true; - } else { - bit_depth_ = VPX_BITS_8; - use_high_bit_depth_ = false; - } - mask_ = (1 << bit_depth_) - 1; + params_ = this->GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); - block_size_ = width_ * height_; - if (!use_high_bit_depth_) { - src_ = reinterpret_cast(vpx_memalign(16, block_size_)); - sec_ = reinterpret_cast(vpx_memalign(16, block_size_)); - ref_ = new uint8_t[block_size_ + width_ + height_ + 1]; + if (!use_high_bit_depth()) { + src_ = reinterpret_cast(vpx_memalign(16, block_size())); + sec_ = reinterpret_cast(vpx_memalign(16, block_size())); + ref_ = new uint8_t[block_size() + width() + height() + 1]; #if CONFIG_VP9_HIGHBITDEPTH } else { src_ = CONVERT_TO_BYTEPTR(reinterpret_cast( - vpx_memalign(16, block_size_ * sizeof(uint16_t)))); + vpx_memalign(16, block_size() * sizeof(uint16_t)))); sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast( - vpx_memalign(16, block_size_ * sizeof(uint16_t)))); - ref_ = - CONVERT_TO_BYTEPTR(new uint16_t[block_size_ + width_ + height_ + 1]); + vpx_memalign(16, block_size() * sizeof(uint16_t)))); + ref_ = CONVERT_TO_BYTEPTR( + new uint16_t[block_size() + width() + height() + 1]); #endif // CONFIG_VP9_HIGHBITDEPTH } ASSERT_TRUE(src_ != NULL); @@ -577,7 +589,7 @@ class SubpelVarianceTest } virtual void TearDown() { - if (!use_high_bit_depth_) { + if (!use_high_bit_depth()) { vpx_free(src_); delete[] ref_; vpx_free(sec_); @@ -599,42 +611,45 @@ class SubpelVarianceTest uint8_t *src_; uint8_t *ref_; uint8_t *sec_; - bool use_high_bit_depth_; - vpx_bit_depth_t bit_depth_; - int width_, 
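// The SpeedTest() added above runs each variance kernel (1 << 30) /
// block_size() times, so the total number of pixels processed is roughly the
// same for every block size, and reports wall-clock time via vpx_usec_timer.
// It is registered further down as DISABLED_Speed, so it only runs when
// explicitly requested (for example with gtest's
// --gtest_also_run_disabled_tests flag). Below is a standalone sketch of the
// same timing pattern; BlockSse is a hypothetical stand-in kernel and
// std::chrono replaces the vpx timer.

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-in kernel: sum of squared differences between two 8-bit blocks laid
// out with the given stride.
static uint32_t BlockSse(const uint8_t *src, const uint8_t *ref, int stride,
                         int width, int height) {
  uint32_t sse = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      const int d = src[r * stride + c] - ref[r * stride + c];
      sse += static_cast<uint32_t>(d * d);
    }
  }
  return sse;
}

int main() {
  const int width = 16, height = 16, stride = 16;
  const int block_size = width * height;
  uint8_t src[16 * 16], ref[16 * 16];
  memset(src, 255, sizeof(src));
  memset(ref, 255, block_size / 2);                  // first half all-max ...
  memset(ref + block_size / 2, 0, block_size / 2);   // ... second half zero

  const int num_runs = (1 << 30) / block_size;  // same scaling as the patch
  volatile uint32_t sink = 0;  // keep the compiler from dropping the loop

  const std::chrono::steady_clock::time_point start =
      std::chrono::steady_clock::now();
  for (int i = 0; i < num_runs; ++i) {
    sink += BlockSse(src, ref, stride, width, height);
  }
  const std::chrono::steady_clock::time_point stop =
      std::chrono::steady_clock::now();

  const long ms = static_cast<long>(
      std::chrono::duration_cast<std::chrono::milliseconds>(stop - start)
          .count());
  printf("SSE %dx%d: %ld ms (last sse %u)\n", width, height, ms,
         static_cast<unsigned>(sink));
  return 0;
}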
log2width_; - int height_, log2height_; - int block_size_, mask_; - SubpelVarianceFunctionType subpel_variance_; + TestParams params_; + + // some relay helpers + bool use_high_bit_depth() const { return params_.use_high_bit_depth; } + int byte_shift() const { return params_.bit_depth - 8; } + int block_size() const { return params_.block_size; } + int width() const { return params_.width; } + int height() const { return params_.height; } + uint32_t mask() const { return params_.mask; } }; template void SubpelVarianceTest::RefTest() { for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { - if (!use_high_bit_depth_) { - for (int j = 0; j < block_size_; j++) { + if (!use_high_bit_depth()) { + for (int j = 0; j < block_size(); j++) { src_[j] = rnd_.Rand8(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + for (int j = 0; j < block_size() + width() + height() + 1; j++) { ref_[j] = rnd_.Rand8(); } #if CONFIG_VP9_HIGHBITDEPTH } else { - for (int j = 0; j < block_size_; j++) { - CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size(); j++) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { - CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size() + width() + height() + 1; j++) { + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); } #endif // CONFIG_VP9_HIGHBITDEPTH } unsigned int sse1, sse2; unsigned int var1; ASM_REGISTER_STATE_CHECK( - var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1)); - const unsigned int var2 = - subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2, - use_high_bit_depth_, bit_depth_); + var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1)); + const unsigned int var2 = subpel_variance_ref( + ref_, src_, params_.log2width, params_.log2height, x, y, &sse2, + use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; } @@ -648,28 +663,28 @@ void SubpelVarianceTest::ExtremeRefTest() { // Ref: Set the first half of values to the maximum, the second half to 0. 
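// The half-maximum / half-zero blocks described just above produce the
// largest possible per-pixel differences, so they exercise the widest
// intermediate sums an optimized kernel can see; any narrowing of those sums
// shows up as a mismatch against the C reference. A small arithmetic check of
// the worst-case sum of squared errors, assuming a 64x64 block (the largest
// size instantiated in this file):

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t pixels = 64 * 64;
  // 8-bit: the largest possible per-pixel difference is 255.
  const uint64_t max_diff_8 = 255;
  const uint64_t worst_sse_8 = pixels * max_diff_8 * max_diff_8;
  // 12-bit: per-pixel differences can reach 4095.
  const uint64_t max_diff_12 = 4095;
  const uint64_t worst_sse_12 = pixels * max_diff_12 * max_diff_12;

  printf("worst-case SSE, 8-bit 64x64 : %llu (fits in 32 bits: %s)\n",
         (unsigned long long)worst_sse_8,
         worst_sse_8 <= UINT32_MAX ? "yes" : "no");
  printf("worst-case SSE, 12-bit 64x64: %llu (fits in 32 bits: %s)\n",
         (unsigned long long)worst_sse_12,
         worst_sse_12 <= UINT32_MAX ? "yes" : "no");
  return 0;
}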
for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { - const int half = block_size_ / 2; - if (!use_high_bit_depth_) { + const int half = block_size() / 2; + if (!use_high_bit_depth()) { memset(src_, 0, half); memset(src_ + half, 255, half); memset(ref_, 255, half); - memset(ref_ + half, 0, half + width_ + height_ + 1); + memset(ref_ + half, 0, half + width() + height() + 1); #if CONFIG_VP9_HIGHBITDEPTH } else { - vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask_, half); + vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask(), half); vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half); vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half); - vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask_, - half + width_ + height_ + 1); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask(), + half + width() + height() + 1); #endif // CONFIG_VP9_HIGHBITDEPTH } unsigned int sse1, sse2; unsigned int var1; ASM_REGISTER_STATE_CHECK( - var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1)); - const unsigned int var2 = - subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2, - use_high_bit_depth_, bit_depth_); + var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1)); + const unsigned int var2 = subpel_variance_ref( + ref_, src_, params_.log2width, params_.log2height, x, y, &sse2, + use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y; EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y; } @@ -680,33 +695,32 @@ template <> void SubpelVarianceTest::RefTest() { for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { - if (!use_high_bit_depth_) { - for (int j = 0; j < block_size_; j++) { + if (!use_high_bit_depth()) { + for (int j = 0; j < block_size(); j++) { src_[j] = rnd_.Rand8(); sec_[j] = rnd_.Rand8(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + for (int j = 0; j < block_size() + width() + height() + 1; j++) { ref_[j] = rnd_.Rand8(); } #if CONFIG_VP9_HIGHBITDEPTH } else { - for (int j = 0; j < block_size_; j++) { - CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_; - CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size(); j++) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { - CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size() + width() + height() + 1; j++) { + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); } #endif // CONFIG_VP9_HIGHBITDEPTH } uint32_t sse1, sse2; uint32_t var1, var2; - ASM_REGISTER_STATE_CHECK(var1 = - subpel_variance_(ref_, width_ + 1, x, y, - src_, width_, &sse1, sec_)); - var2 = subpel_avg_variance_ref(ref_, src_, sec_, log2width_, log2height_, - x, y, &sse2, use_high_bit_depth_, - static_cast(bit_depth_)); + ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 1, x, y, + src_, width(), &sse1, sec_)); + var2 = subpel_avg_variance_ref(ref_, src_, sec_, params_.log2width, + params_.log2height, x, y, &sse2, + use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; } @@ -727,6 +741,7 @@ TEST_P(VpxVarianceTest, Zero) { ZeroTest(); } TEST_P(VpxVarianceTest, Ref) { RefTest(); } TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); } TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); } +TEST_P(VpxVarianceTest, 
DISABLED_Speed) { SpeedTest(); } TEST_P(SumOfSquaresTest, Const) { ConstTest(); } TEST_P(SumOfSquaresTest, Ref) { RefTest(); } TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); } @@ -765,37 +780,41 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(2, 3, &vpx_variance4x8_c), VarianceParams(2, 2, &vpx_variance4x4_c))); +typedef TestParams SubpelVarianceParams; INSTANTIATE_TEST_CASE_P( C, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_c, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_c, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_c, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_c, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_c, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_c, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_c, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_c, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_c, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_c, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_c, 0), - make_tuple(2, 2, &vpx_sub_pixel_variance4x4_c, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_c, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_c, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_c, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_c, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_c, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_c, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_c, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_c, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_c, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_c, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_c, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_c, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_c, 0))); +typedef TestParams SubpelAvgVarianceParams; INSTANTIATE_TEST_CASE_P( C, VpxSubpelAvgVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0), - make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0))); + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0), + 
SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0))); #if CONFIG_VP9_HIGHBITDEPTH typedef MainTestClass VpxHBDMseTest; @@ -809,6 +828,7 @@ TEST_P(VpxHBDVarianceTest, Zero) { ZeroTest(); } TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); } TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); } TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); } +TEST_P(VpxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); } TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); } TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } @@ -816,18 +836,18 @@ TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } /* TODO(debargha): This test does not support the highbd version INSTANTIATE_TEST_CASE_P( C, VpxHBDMseTest, - ::testing::Values(make_tuple(4, 4, &vpx_highbd_12_mse16x16_c), - make_tuple(4, 4, &vpx_highbd_12_mse16x8_c), - make_tuple(4, 4, &vpx_highbd_12_mse8x16_c), - make_tuple(4, 4, &vpx_highbd_12_mse8x8_c), - make_tuple(4, 4, &vpx_highbd_10_mse16x16_c), - make_tuple(4, 4, &vpx_highbd_10_mse16x8_c), - make_tuple(4, 4, &vpx_highbd_10_mse8x16_c), - make_tuple(4, 4, &vpx_highbd_10_mse8x8_c), - make_tuple(4, 4, &vpx_highbd_8_mse16x16_c), - make_tuple(4, 4, &vpx_highbd_8_mse16x8_c), - make_tuple(4, 4, &vpx_highbd_8_mse8x16_c), - make_tuple(4, 4, &vpx_highbd_8_mse8x8_c))); + ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c), + MseParams(4, 4, &vpx_highbd_12_mse16x8_c), + MseParams(4, 4, &vpx_highbd_12_mse8x16_c), + MseParams(4, 4, &vpx_highbd_12_mse8x8_c), + MseParams(4, 4, &vpx_highbd_10_mse16x16_c), + MseParams(4, 4, &vpx_highbd_10_mse16x8_c), + MseParams(4, 4, &vpx_highbd_10_mse8x16_c), + MseParams(4, 4, &vpx_highbd_10_mse8x8_c), + MseParams(4, 4, &vpx_highbd_8_mse16x16_c), + MseParams(4, 4, &vpx_highbd_8_mse16x8_c), + MseParams(4, 4, &vpx_highbd_8_mse8x16_c), + MseParams(4, 4, &vpx_highbd_8_mse8x8_c))); */ INSTANTIATE_TEST_CASE_P( @@ -875,88 +895,161 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( C, VpxHBDSubpelVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8), - make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8), - make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8), - make_tuple(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8), - make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, 10), - make_tuple(4, 4, 
&vpx_highbd_10_sub_pixel_variance16x16_c, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10), - make_tuple(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10), - make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10), - make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12), - make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12), - make_tuple(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12), - make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, 12))); + SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8), + SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8), + SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8), + SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8), + SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8), + SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8), + SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8), + SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8), + SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8), + SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8), + SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8), + SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8), + SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, + 10), + SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, + 10), + SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, + 10), + SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, + 10), + SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, + 10), + SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, + 10), + SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c, + 10), + SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10), + SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10), + SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10), + SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10), + SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10), + SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10), + SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, + 12), + SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, + 12), + SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, + 12), + SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, + 12), + 
SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, + 12), + SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, + 12), + SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, + 12), + SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12), + SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12), + SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12), + SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12), + SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12), + SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, + 12))); INSTANTIATE_TEST_CASE_P( C, VpxHBDSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8), - make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8), - make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, 8), - make_tuple(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, 8), - make_tuple(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, 8), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_c, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_c, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_c, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_c, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_c, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_c, 10), - make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_c, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_c, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_c, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10), - make_tuple(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10), - make_tuple(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10), - make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_c, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_c, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_c, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_c, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_c, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_c, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_c, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_c, 12), - make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_c, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12), - make_tuple(2, 3, &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12), - make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_c, 12))); + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8), + SubpelAvgVarianceParams(6, 5, + 
&vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8), + SubpelAvgVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, + 8), + SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, + 8), + SubpelAvgVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, + 8), + SubpelAvgVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, + 8), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_10_sub_pixel_avg_variance64x64_c, + 10), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_10_sub_pixel_avg_variance64x32_c, + 10), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_10_sub_pixel_avg_variance32x64_c, + 10), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_10_sub_pixel_avg_variance32x32_c, + 10), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_10_sub_pixel_avg_variance32x16_c, + 10), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_10_sub_pixel_avg_variance16x32_c, + 10), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_10_sub_pixel_avg_variance16x16_c, + 10), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_10_sub_pixel_avg_variance16x8_c, + 10), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_10_sub_pixel_avg_variance8x16_c, + 10), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_12_sub_pixel_avg_variance64x64_c, + 12), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_12_sub_pixel_avg_variance64x32_c, + 12), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_12_sub_pixel_avg_variance32x64_c, + 12), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_12_sub_pixel_avg_variance32x32_c, + 12), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_12_sub_pixel_avg_variance32x16_c, + 12), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_12_sub_pixel_avg_variance16x32_c, + 12), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_12_sub_pixel_avg_variance16x16_c, + 12), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_12_sub_pixel_avg_variance16x8_c, + 12), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_12_sub_pixel_avg_variance8x16_c, + 12), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_12_sub_pixel_avg_variance4x4_c, + 12))); #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_SSE2 @@ -987,36 +1080,37 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE2, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0), - make_tuple(5, 5, 
&vpx_sub_pixel_variance32x32_sse2, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0), - make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0))); INSTANTIATE_TEST_CASE_P( SSE2, VpxSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0), - make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0))); + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0))); #if CONFIG_VP9_HIGHBITDEPTH /* TODO(debargha): This test does not support the highbd version @@ -1073,112 +1167,219 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE2, VpxHBDSubpelVarianceTest, ::testing::Values( - make_tuple(6, 6, 
&vpx_highbd_12_sub_pixel_variance64x64_sse2, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2, 12), - make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2, 12), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2, 10), - make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2, 10), - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2, 8), - make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2, 8), - make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2, 8))); + SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2, + 12), + SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2, + 12), + SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2, + 12), + SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2, + 12), + SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2, + 12), + SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2, + 12), + SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2, + 12), + SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2, + 12), + SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2, + 12), + SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2, + 12), + SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2, + 12), + SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2, + 10), + SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2, + 10), + SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2, + 10), + SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2, + 10), + SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2, + 10), + SubpelVarianceParams(4, 5, 
&vpx_highbd_10_sub_pixel_variance16x32_sse2, + 10), + SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2, + 10), + SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2, + 10), + SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2, + 10), + SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2, + 10), + SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2, + 10), + SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2, + 8), + SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2, + 8), + SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2, + 8), + SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2, + 8), + SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2, + 8), + SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2, + 8), + SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2, + 8), + SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2, + 8), + SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2, + 8), + SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2, + 8))); INSTANTIATE_TEST_CASE_P( SSE2, VpxHBDSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2, 12), - make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2, 12), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2, 10), - make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2, 10), - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2, 8), - make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2, 8), 
- make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, 8))); + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2, + 12), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2, + 12), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2, + 12), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2, + 12), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2, + 12), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2, + 12), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2, + 12), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2, + 12), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2, + 12), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2, + 12), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2, + 12), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2, + 10), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2, + 10), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2, + 10), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2, + 10), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2, + 10), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2, + 10), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2, + 10), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2, + 10), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2, + 10), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2, + 10), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2, + 10), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2, + 8), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2, + 8), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2, + 8), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2, + 8), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2, + 8), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2, + 8), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2, + 8), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2, + 8), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, + 8), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, + 8), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, + 8))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_SSE2 #if HAVE_SSSE3 INSTANTIATE_TEST_CASE_P( SSSE3, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0), - make_tuple(5, 4, 
&vpx_sub_pixel_variance32x16_ssse3, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0), - make_tuple(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0))); INSTANTIATE_TEST_CASE_P( SSSE3, VpxSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0), - make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, 0))); + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3, + 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3, + 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3, + 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3, + 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3, + 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3, + 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3, + 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, + 0))); #endif // HAVE_SSSE3 #if HAVE_AVX2 @@ -1195,14 +1396,16 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( AVX2, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0), - make_tuple(5, 5, 
&vpx_sub_pixel_variance32x32_avx2, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0))); INSTANTIATE_TEST_CASE_P( AVX2, VpxSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, 0))); + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, + 0))); #endif // HAVE_AVX2 #if HAVE_NEON @@ -1219,17 +1422,49 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(6, 5, &vpx_variance64x32_neon), VarianceParams(5, 6, &vpx_variance32x64_neon), VarianceParams(5, 5, &vpx_variance32x32_neon), + VarianceParams(5, 4, &vpx_variance32x16_neon), + VarianceParams(4, 5, &vpx_variance16x32_neon), VarianceParams(4, 4, &vpx_variance16x16_neon), VarianceParams(4, 3, &vpx_variance16x8_neon), VarianceParams(3, 4, &vpx_variance8x16_neon), - VarianceParams(3, 3, &vpx_variance8x8_neon))); + VarianceParams(3, 3, &vpx_variance8x8_neon), + VarianceParams(3, 2, &vpx_variance8x4_neon), + VarianceParams(2, 3, &vpx_variance4x8_neon), + VarianceParams(2, 2, &vpx_variance4x4_neon))); INSTANTIATE_TEST_CASE_P( NEON, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_neon, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_neon, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_neon, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_neon, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_neon, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_neon, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_neon, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_neon, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_neon, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_neon, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_neon, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_neon, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_neon, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_neon, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_neon, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_neon, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_neon, 0))); + +INSTANTIATE_TEST_CASE_P( + NEON, VpxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_neon, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_neon, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_neon, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_neon, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_neon, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_neon, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_neon, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_neon, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_neon, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_neon, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0))); #endif // HAVE_NEON #if HAVE_MSA @@ -1264,34 
+1499,103 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( MSA, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(2, 2, &vpx_sub_pixel_variance4x4_msa, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_msa, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_msa, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_msa, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_msa, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_msa, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_msa, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_msa, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_msa, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_msa, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_msa, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_msa, 0), - make_tuple(6, 6, &vpx_sub_pixel_variance64x64_msa, 0))); + ::testing::Values( + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_msa, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_msa, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_msa, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_msa, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_msa, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_msa, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_msa, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_msa, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_msa, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_msa, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_msa, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_msa, 0), + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_msa, 0))); INSTANTIATE_TEST_CASE_P( MSA, VpxSubpelAvgVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0), - make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0))); + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 
0))); #endif // HAVE_MSA + +#if HAVE_VSX +INSTANTIATE_TEST_CASE_P(VSX, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_vsx)); + +INSTANTIATE_TEST_CASE_P(VSX, VpxSseTest, + ::testing::Values(SseParams(2, 2, + &vpx_get4x4sse_cs_vsx))); +#endif // HAVE_VSX + +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P(MMI, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_mmi), + MseParams(4, 3, &vpx_mse16x8_mmi), + MseParams(3, 4, &vpx_mse8x16_mmi), + MseParams(3, 3, &vpx_mse8x8_mmi))); + +INSTANTIATE_TEST_CASE_P( + MMI, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_mmi), + VarianceParams(6, 5, &vpx_variance64x32_mmi), + VarianceParams(5, 6, &vpx_variance32x64_mmi), + VarianceParams(5, 5, &vpx_variance32x32_mmi), + VarianceParams(5, 4, &vpx_variance32x16_mmi), + VarianceParams(4, 5, &vpx_variance16x32_mmi), + VarianceParams(4, 4, &vpx_variance16x16_mmi), + VarianceParams(4, 3, &vpx_variance16x8_mmi), + VarianceParams(3, 4, &vpx_variance8x16_mmi), + VarianceParams(3, 3, &vpx_variance8x8_mmi), + VarianceParams(3, 2, &vpx_variance8x4_mmi), + VarianceParams(2, 3, &vpx_variance4x8_mmi), + VarianceParams(2, 2, &vpx_variance4x4_mmi))); + +INSTANTIATE_TEST_CASE_P( + MMI, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_mmi, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_mmi, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_mmi, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_mmi, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_mmi, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_mmi, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_mmi, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_mmi, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_mmi, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_mmi, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_mmi, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_mmi, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_mmi, 0))); + +INSTANTIATE_TEST_CASE_P( + MMI, VpxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_mmi, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_mmi, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_mmi, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_mmi, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_mmi, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_mmi, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_mmi, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_mmi, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_mmi, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_mmi, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_mmi, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_mmi, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_mmi, 0))); +#endif // HAVE_MMI } // namespace diff --git a/libs/libvpx/test/video_source.h b/libs/libvpx/test/video_source.h index 424bb2cfb4..54f692865b 100644 --- a/libs/libvpx/test/video_source.h +++ b/libs/libvpx/test/video_source.h @@ -13,7 +13,9 @@ #if defined(_WIN32) #undef NOMINMAX #define NOMINMAX +#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif #include #endif #include diff --git 
a/libs/libvpx/test/vp8_fdct4x4_test.cc b/libs/libvpx/test/vp8_fdct4x4_test.cc index da4f0caa1e..b7697d8596 100644 --- a/libs/libvpx/test/vp8_fdct4x4_test.cc +++ b/libs/libvpx/test/vp8_fdct4x4_test.cc @@ -17,12 +17,16 @@ #include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vpx_config.h" #include "./vp8_rtcd.h" #include "test/acm_random.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" namespace { +typedef void (*FdctFunc)(int16_t *a, int16_t *b, int a_stride); + const int cospi8sqrt2minus1 = 20091; const int sinpi8sqrt2 = 35468; @@ -68,10 +72,21 @@ void reference_idct4x4(const int16_t *input, int16_t *output) { using libvpx_test::ACMRandom; -TEST(VP8FdctTest, SignBiasCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); +class FdctTest : public ::testing::TestWithParam { + public: + virtual void SetUp() { + fdct_func_ = GetParam(); + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + protected: + FdctFunc fdct_func_; + ACMRandom rnd_; +}; + +TEST_P(FdctTest, SignBiasCheck) { int16_t test_input_block[16]; - int16_t test_output_block[16]; + DECLARE_ALIGNED(16, int16_t, test_output_block[16]); const int pitch = 8; int count_sign_block[16][2]; const int count_test_block = 1000000; @@ -81,10 +96,10 @@ TEST(VP8FdctTest, SignBiasCheck) { for (int i = 0; i < count_test_block; ++i) { // Initialize a test block with input range [-255, 255]. for (int j = 0; j < 16; ++j) { - test_input_block[j] = rnd.Rand8() - rnd.Rand8(); + test_input_block[j] = rnd_.Rand8() - rnd_.Rand8(); } - vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch); + fdct_func_(test_input_block, test_output_block, pitch); for (int j = 0; j < 16; ++j) { if (test_output_block[j] < 0) { @@ -110,10 +125,10 @@ TEST(VP8FdctTest, SignBiasCheck) { for (int i = 0; i < count_test_block; ++i) { // Initialize a test block with input range [-15, 15]. for (int j = 0; j < 16; ++j) { - test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4); + test_input_block[j] = (rnd_.Rand8() >> 4) - (rnd_.Rand8() >> 4); } - vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch); + fdct_func_(test_input_block, test_output_block, pitch); for (int j = 0; j < 16; ++j) { if (test_output_block[j] < 0) { @@ -135,23 +150,22 @@ TEST(VP8FdctTest, SignBiasCheck) { << "Error: 4x4 FDCT has a sign bias > 10% for input range [-15, 15]"; }; -TEST(VP8FdctTest, RoundTripErrorCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); +TEST_P(FdctTest, RoundTripErrorCheck) { int max_error = 0; double total_error = 0; const int count_test_block = 1000000; for (int i = 0; i < count_test_block; ++i) { int16_t test_input_block[16]; - int16_t test_temp_block[16]; int16_t test_output_block[16]; + DECLARE_ALIGNED(16, int16_t, test_temp_block[16]); // Initialize a test block with input range [-255, 255]. 
for (int j = 0; j < 16; ++j) { - test_input_block[j] = rnd.Rand8() - rnd.Rand8(); + test_input_block[j] = rnd_.Rand8() - rnd_.Rand8(); } const int pitch = 8; - vp8_short_fdct4x4_c(test_input_block, test_temp_block, pitch); + fdct_func_(test_input_block, test_temp_block, pitch); reference_idct4x4(test_temp_block, test_output_block); for (int j = 0; j < 16; ++j) { @@ -169,4 +183,24 @@ TEST(VP8FdctTest, RoundTripErrorCheck) { << "Error: FDCT/IDCT has average roundtrip error > 1 per block"; }; +INSTANTIATE_TEST_CASE_P(C, FdctTest, ::testing::Values(vp8_short_fdct4x4_c)); + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, FdctTest, + ::testing::Values(vp8_short_fdct4x4_neon)); +#endif // HAVE_NEON + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P(SSE2, FdctTest, + ::testing::Values(vp8_short_fdct4x4_sse2)); +#endif // HAVE_SSE2 + +#if HAVE_MSA +INSTANTIATE_TEST_CASE_P(MSA, FdctTest, + ::testing::Values(vp8_short_fdct4x4_msa)); +#endif // HAVE_MSA +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P(MMI, FdctTest, + ::testing::Values(vp8_short_fdct4x4_mmi)); +#endif // HAVE_MMI } // namespace diff --git a/libs/libvpx/test/vp9_error_block_test.cc b/libs/libvpx/test/vp9_block_error_test.cc similarity index 72% rename from libs/libvpx/test/vp9_error_block_test.cc rename to libs/libvpx/test/vp9_block_error_test.cc index 74436c09e7..0b4d1df992 100644 --- a/libs/libvpx/test/vp9_error_block_test.cc +++ b/libs/libvpx/test/vp9_block_error_test.cc @@ -23,36 +23,36 @@ #include "vp9/common/vp9_entropy.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" using libvpx_test::ACMRandom; namespace { -#if CONFIG_VP9_HIGHBITDEPTH const int kNumIterations = 1000; -typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff, +typedef int64_t (*HBDBlockErrorFunc)(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, + int bps); + +typedef std::tr1::tuple + BlockErrorParam; + +typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff, const tran_low_t *dqcoeff, - intptr_t block_size, int64_t *ssz, int bps); + intptr_t block_size, int64_t *ssz); -typedef std::tr1::tuple - ErrorBlockParam; - -// wrapper for 8-bit block error functions without a 'bps' param. 
-typedef int64_t (*HighBdBlockError8bit)(const tran_low_t *coeff, - const tran_low_t *dqcoeff, - intptr_t block_size, int64_t *ssz); -template -int64_t HighBdBlockError8bitWrapper(const tran_low_t *coeff, - const tran_low_t *dqcoeff, - intptr_t block_size, int64_t *ssz, - int bps) { - EXPECT_EQ(8, bps); +template +int64_t BlockError8BitWrapper(const tran_low_t *coeff, + const tran_low_t *dqcoeff, intptr_t block_size, + int64_t *ssz, int bps) { + EXPECT_EQ(bps, 8); return fn(coeff, dqcoeff, block_size, ssz); } -class ErrorBlockTest : public ::testing::TestWithParam { +class BlockErrorTest : public ::testing::TestWithParam { public: - virtual ~ErrorBlockTest() {} + virtual ~BlockErrorTest() {} virtual void SetUp() { error_block_op_ = GET_PARAM(0); ref_error_block_op_ = GET_PARAM(1); @@ -63,11 +63,11 @@ class ErrorBlockTest : public ::testing::TestWithParam { protected: vpx_bit_depth_t bit_depth_; - ErrorBlockFunc error_block_op_; - ErrorBlockFunc ref_error_block_op_; + HBDBlockErrorFunc error_block_op_; + HBDBlockErrorFunc ref_error_block_op_; }; -TEST_P(ErrorBlockTest, OperationCheck) { +TEST_P(BlockErrorTest, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); DECLARE_ALIGNED(16, tran_low_t, coeff[4096]); DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]); @@ -110,7 +110,7 @@ TEST_P(ErrorBlockTest, OperationCheck) { << "First failed at test case " << first_failure; } -TEST_P(ErrorBlockTest, ExtremeValues) { +TEST_P(BlockErrorTest, ExtremeValues) { ACMRandom rnd(ACMRandom::DeterministicSeed()); DECLARE_ALIGNED(16, tran_low_t, coeff[4096]); DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]); @@ -171,29 +171,28 @@ TEST_P(ErrorBlockTest, ExtremeValues) { using std::tr1::make_tuple; #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( - SSE2, ErrorBlockTest, - ::testing::Values( - make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, - VPX_BITS_10), - make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, - VPX_BITS_12), - make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, - VPX_BITS_8), - make_tuple( - &HighBdBlockError8bitWrapper, - &HighBdBlockError8bitWrapper, - VPX_BITS_8))); +const BlockErrorParam sse2_block_error_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, + VPX_BITS_10), + make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, + VPX_BITS_12), + make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, + VPX_BITS_8), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&BlockError8BitWrapper, + &BlockError8BitWrapper, VPX_BITS_8) +}; + +INSTANTIATE_TEST_CASE_P(SSE2, BlockErrorTest, + ::testing::ValuesIn(sse2_block_error_tests)); #endif // HAVE_SSE2 -#if HAVE_AVX +#if HAVE_AVX2 INSTANTIATE_TEST_CASE_P( - AVX, ErrorBlockTest, - ::testing::Values(make_tuple( - &HighBdBlockError8bitWrapper, - &HighBdBlockError8bitWrapper, - VPX_BITS_8))); -#endif // HAVE_AVX - -#endif // CONFIG_VP9_HIGHBITDEPTH + AVX2, BlockErrorTest, + ::testing::Values(make_tuple(&BlockError8BitWrapper, + &BlockError8BitWrapper, + VPX_BITS_8))); +#endif // HAVE_AVX2 } // namespace diff --git a/libs/libvpx/test/vp9_denoiser_sse2_test.cc b/libs/libvpx/test/vp9_denoiser_test.cc similarity index 54% rename from libs/libvpx/test/vp9_denoiser_sse2_test.cc rename to libs/libvpx/test/vp9_denoiser_test.cc index 2a50b77355..56ca257c59 100644 --- a/libs/libvpx/test/vp9_denoiser_sse2_test.cc +++ b/libs/libvpx/test/vp9_denoiser_test.cc @@ -29,11 +29,21 @@ using libvpx_test::ACMRandom; namespace { const int kNumPixels 
= 64 * 64; -class VP9DenoiserTest : public ::testing::TestWithParam { + +typedef int (*Vp9DenoiserFilterFunc)(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude); +typedef std::tr1::tuple VP9DenoiserTestParam; + +class VP9DenoiserTest + : public ::testing::Test, + public ::testing::WithParamInterface { public: virtual ~VP9DenoiserTest() {} - virtual void SetUp() { bs_ = GetParam(); } + virtual void SetUp() { bs_ = GET_PARAM(1); } virtual void TearDown() { libvpx_test::ClearSystemState(); } @@ -76,9 +86,9 @@ TEST_P(VP9DenoiserTest, BitexactCheck) { 64, avg_block_c, 64, 0, bs_, motion_magnitude_random)); - ASM_REGISTER_STATE_CHECK(vp9_denoiser_filter_sse2( - sig_block, 64, mc_avg_block, 64, avg_block_sse2, 64, 0, bs_, - motion_magnitude_random)); + ASM_REGISTER_STATE_CHECK(GET_PARAM(0)(sig_block, 64, mc_avg_block, 64, + avg_block_sse2, 64, 0, bs_, + motion_magnitude_random)); // Test bitexactness. for (int h = 0; h < (4 << b_height_log2_lookup[bs_]); ++h) { @@ -89,10 +99,36 @@ TEST_P(VP9DenoiserTest, BitexactCheck) { } } +using std::tr1::make_tuple; + // Test for all block size. -INSTANTIATE_TEST_CASE_P(SSE2, VP9DenoiserTest, - ::testing::Values(BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, - BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, - BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, - BLOCK_64X64)); +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, VP9DenoiserTest, + ::testing::Values(make_tuple(&vp9_denoiser_filter_sse2, BLOCK_8X8), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_8X16), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_16X8), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_16X16), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_16X32), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_32X16), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_32X32), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_32X64), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_64X32), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_64X64))); +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, VP9DenoiserTest, + ::testing::Values(make_tuple(&vp9_denoiser_filter_neon, BLOCK_8X8), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_8X16), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X8), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X16), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X32), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X16), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X32), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X64), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_64X32), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_64X64))); +#endif } // namespace diff --git a/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc b/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc index 53dc8c9fe4..62e8dcb9b5 100644 --- a/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc +++ b/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc @@ -99,9 +99,7 @@ class VpxEncoderParmsGetToDecoder vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder(); vpx_codec_alg_priv_t *const priv = reinterpret_cast(vp9_decoder->priv); - FrameWorkerData *const worker_data = - reinterpret_cast(priv->frame_workers[0].data1); - VP9_COMMON *const common = &worker_data->pbi->common; + VP9_COMMON *const common = &priv->pbi->common; if (encode_parms.lossless) { EXPECT_EQ(0, common->base_qindex); diff --git a/libs/libvpx/test/vp9_ethread_test.cc b/libs/libvpx/test/vp9_ethread_test.cc index 0590487fc0..6b7e512116 100644 
--- a/libs/libvpx/test/vp9_ethread_test.cc +++ b/libs/libvpx/test/vp9_ethread_test.cc @@ -16,17 +16,221 @@ #include "test/md5_helper.h" #include "test/util.h" #include "test/y4m_video_source.h" +#include "vp9/encoder/vp9_firstpass.h" namespace { -class VPxEncoderThreadTest +// FIRSTPASS_STATS struct: +// { +// 25 double members; +// 1 int64_t member; +// } +// Whenever FIRSTPASS_STATS struct is modified, the following constants need to +// be revisited. +const int kDbl = 25; +const int kInt = 1; +const size_t kFirstPassStatsSz = kDbl * sizeof(double) + kInt * sizeof(int64_t); + +class VPxFirstPassEncoderThreadTest : public ::libvpx_test::EncoderTest, public ::libvpx_test::CodecTestWith2Params { + protected: + VPxFirstPassEncoderThreadTest() + : EncoderTest(GET_PARAM(0)), encoder_initialized_(false), tiles_(0), + encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)) { + init_flags_ = VPX_CODEC_USE_PSNR; + + row_mt_mode_ = 1; + first_pass_only_ = true; + firstpass_stats_.buf = NULL; + firstpass_stats_.sz = 0; + } + virtual ~VPxFirstPassEncoderThreadTest() { free(firstpass_stats_.buf); } + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + + cfg_.rc_end_usage = VPX_VBR; + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_maxsection_pct = 2000; + cfg_.rc_max_quantizer = 56; + cfg_.rc_min_quantizer = 0; + } + + virtual void BeginPassHook(unsigned int /*pass*/) { + encoder_initialized_ = false; + abort_ = false; + } + + virtual void EndPassHook() { + // For first pass stats test, only run first pass encoder. + if (first_pass_only_ && cfg_.g_pass == VPX_RC_FIRST_PASS) + abort_ |= first_pass_only_; + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, + ::libvpx_test::Encoder *encoder) { + if (!encoder_initialized_) { + // Encode in 2-pass mode. + encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_); + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); + encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); + encoder->Control(VP8E_SET_ARNR_TYPE, 3); + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 0); + + if (encoding_mode_ == ::libvpx_test::kTwoPassGood) + encoder->Control(VP9E_SET_ROW_MT, row_mt_mode_); + + encoder_initialized_ = true; + } + } + + virtual void StatsPktHook(const vpx_codec_cx_pkt_t *pkt) { + const uint8_t *const pkt_buf = + reinterpret_cast(pkt->data.twopass_stats.buf); + const size_t pkt_size = pkt->data.twopass_stats.sz; + + // First pass stats size equals sizeof(FIRSTPASS_STATS) + EXPECT_EQ(pkt_size, kFirstPassStatsSz) + << "Error: First pass stats size doesn't equal kFirstPassStatsSz"; + + firstpass_stats_.buf = + realloc(firstpass_stats_.buf, firstpass_stats_.sz + pkt_size); + memcpy((uint8_t *)firstpass_stats_.buf + firstpass_stats_.sz, pkt_buf, + pkt_size); + firstpass_stats_.sz += pkt_size; + } + + bool encoder_initialized_; + int tiles_; + ::libvpx_test::TestMode encoding_mode_; + int set_cpu_used_; + int row_mt_mode_; + bool first_pass_only_; + vpx_fixed_buf_t firstpass_stats_; +}; + +static void compare_fp_stats(vpx_fixed_buf_t *fp_stats, double factor) { + // fp_stats consists of 2 set of first pass encoding stats. These 2 set of + // stats are compared to check if the stats match or at least are very close. 
+  FIRSTPASS_STATS *stats1 = reinterpret_cast<FIRSTPASS_STATS *>(fp_stats->buf);
+  int nframes_ = (int)(fp_stats->sz / sizeof(FIRSTPASS_STATS));
+  FIRSTPASS_STATS *stats2 = stats1 + nframes_ / 2;
+  int i, j;
+
+  // The total stats are also output and included in the first pass stats. Here
+  // ignore that in the comparison.
+  for (i = 0; i < (nframes_ / 2 - 1); ++i) {
+    const double *frame_stats1 = reinterpret_cast<const double *>(stats1);
+    const double *frame_stats2 = reinterpret_cast<const double *>(stats2);
+
+    for (j = 0; j < kDbl; ++j) {
+      ASSERT_LE(fabs(*frame_stats1 - *frame_stats2),
+                fabs(*frame_stats1) / factor)
+          << "First failure @ frame #" << i << " stat #" << j << " ("
+          << *frame_stats1 << " vs. " << *frame_stats2 << ")";
+      frame_stats1++;
+      frame_stats2++;
+    }
+
+    stats1++;
+    stats2++;
+  }
+
+  // Reset firstpass_stats_ to 0.
+  memset((uint8_t *)fp_stats->buf, 0, fp_stats->sz);
+  fp_stats->sz = 0;
+}
+
+static void compare_fp_stats_md5(vpx_fixed_buf_t *fp_stats) {
+  // fp_stats consists of 2 set of first pass encoding stats. These 2 set of
+  // stats are compared to check if the stats match.
+  uint8_t *stats1 = reinterpret_cast<uint8_t *>(fp_stats->buf);
+  uint8_t *stats2 = stats1 + fp_stats->sz / 2;
+  ::libvpx_test::MD5 md5_row_mt_0, md5_row_mt_1;
+
+  md5_row_mt_0.Add(stats1, fp_stats->sz / 2);
+  const char *md5_row_mt_0_str = md5_row_mt_0.Get();
+
+  md5_row_mt_1.Add(stats2, fp_stats->sz / 2);
+  const char *md5_row_mt_1_str = md5_row_mt_1.Get();
+
+  // Check md5 match.
+  ASSERT_STREQ(md5_row_mt_0_str, md5_row_mt_1_str)
+      << "MD5 checksums don't match";
+
+  // Reset firstpass_stats_ to 0.
+  memset((uint8_t *)fp_stats->buf, 0, fp_stats->sz);
+  fp_stats->sz = 0;
+}
+
+TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) {
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+
+  first_pass_only_ = true;
+  cfg_.rc_target_bitrate = 1000;
+
+  // Test row_mt_mode: 0 vs 1 at single thread case(threads = 1, tiles_ = 0)
+  tiles_ = 0;
+  cfg_.g_threads = 1;
+
+  row_mt_mode_ = 0;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  row_mt_mode_ = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Compare to check if using or not using row-mt generates close stats.
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0));
+
+  // Test single thread vs multiple threads
+  row_mt_mode_ = 1;
+  tiles_ = 0;
+
+  cfg_.g_threads = 1;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  cfg_.g_threads = 4;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Compare to check if single-thread and multi-thread stats are close enough.
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0));
+
+  // Bit exact test in row_mt mode.
+  // When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact
+  // result.
+  row_mt_mode_ = 1;
+  tiles_ = 2;
+
+  cfg_.g_threads = 2;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  cfg_.g_threads = 8;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Compare to check if stats match with row-mt=0/1.
+ compare_fp_stats_md5(&firstpass_stats_); +} + +class VPxEncoderThreadTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith4Params { protected: VPxEncoderThreadTest() - : EncoderTest(GET_PARAM(0)), encoder_initialized_(false), tiles_(2), + : EncoderTest(GET_PARAM(0)), encoder_initialized_(false), + tiles_(GET_PARAM(3)), threads_(GET_PARAM(4)), encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)) { init_flags_ = VPX_CODEC_USE_PSNR; md5_.clear(); + row_mt_mode_ = 1; + psnr_ = 0.0; + nframes_ = 0; } virtual ~VPxEncoderThreadTest() {} @@ -35,7 +239,6 @@ class VPxEncoderThreadTest SetMode(encoding_mode_); if (encoding_mode_ != ::libvpx_test::kRealTime) { - cfg_.g_lag_in_frames = 3; cfg_.rc_end_usage = VPX_VBR; cfg_.rc_2pass_vbr_minsection_pct = 5; cfg_.rc_2pass_vbr_maxsection_pct = 2000; @@ -50,6 +253,8 @@ class VPxEncoderThreadTest virtual void BeginPassHook(unsigned int /*pass*/) { encoder_initialized_ = false; + psnr_ = 0.0; + nframes_ = 0; } virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, @@ -63,14 +268,22 @@ class VPxEncoderThreadTest encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); encoder->Control(VP8E_SET_ARNR_TYPE, 3); + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 0); } else { encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 0); encoder->Control(VP9E_SET_AQ_MODE, 3); } + encoder->Control(VP9E_SET_ROW_MT, row_mt_mode_); + encoder_initialized_ = true; } } + virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + virtual void DecompressedFrameHook(const vpx_image_t &img, vpx_codec_pts_t /*pts*/) { ::libvpx_test::MD5 md5_res; @@ -89,40 +302,127 @@ class VPxEncoderThreadTest return true; } + double GetAveragePsnr() const { return nframes_ ? (psnr_ / nframes_) : 0.0; } + bool encoder_initialized_; int tiles_; + int threads_; ::libvpx_test::TestMode encoding_mode_; int set_cpu_used_; + int row_mt_mode_; + double psnr_; + unsigned int nframes_; std::vector md5_; }; TEST_P(VPxEncoderThreadTest, EncoderResultTest) { - std::vector single_thr_md5, multi_thr_md5; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 15, 20); - cfg_.rc_target_bitrate = 1000; + // Part 1: Bit exact test for row_mt_mode_ = 0. + // This part keeps original unit tests done before row-mt code is checked in. + row_mt_mode_ = 0; + // Encode using single thread. cfg_.g_threads = 1; init_flags_ = VPX_CODEC_USE_PSNR; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - single_thr_md5 = md5_; + const std::vector single_thr_md5 = md5_; md5_.clear(); // Encode using multiple threads. - cfg_.g_threads = 4; + cfg_.g_threads = threads_; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - multi_thr_md5 = md5_; + const std::vector multi_thr_md5 = md5_; md5_.clear(); // Compare to check if two vectors are equal. ASSERT_EQ(single_thr_md5, multi_thr_md5); + + // Part 2: row_mt_mode_ = 0 vs row_mt_mode_ = 1 single thread bit exact test. + row_mt_mode_ = 1; + + // Encode using single thread + cfg_.g_threads = 1; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + std::vector row_mt_single_thr_md5 = md5_; + md5_.clear(); + + ASSERT_EQ(single_thr_md5, row_mt_single_thr_md5); + + // Part 3: Bit exact test with row-mt on + // When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact + // result. + row_mt_mode_ = 1; + row_mt_single_thr_md5.clear(); + + // Encode using 2 threads. 
+ cfg_.g_threads = 2; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + row_mt_single_thr_md5 = md5_; + md5_.clear(); + + // Encode using multiple threads. + cfg_.g_threads = threads_; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const std::vector row_mt_multi_thr_md5 = md5_; + md5_.clear(); + + // Compare to check if two vectors are equal. + ASSERT_EQ(row_mt_single_thr_md5, row_mt_multi_thr_md5); + + // Part 4: PSNR test with bit_match_mode_ = 0 + row_mt_mode_ = 1; + + // Encode using single thread. + cfg_.g_threads = 1; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double single_thr_psnr = GetAveragePsnr(); + + // Encode using multiple threads. + cfg_.g_threads = threads_; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double multi_thr_psnr = GetAveragePsnr(); + + EXPECT_NEAR(single_thr_psnr, multi_thr_psnr, 0.1); } -VP9_INSTANTIATE_TEST_CASE(VPxEncoderThreadTest, - ::testing::Values(::libvpx_test::kTwoPassGood, - ::libvpx_test::kOnePassGood, - ::libvpx_test::kRealTime), - ::testing::Range(1, 9)); +INSTANTIATE_TEST_CASE_P( + VP9, VPxFirstPassEncoderThreadTest, + ::testing::Combine( + ::testing::Values( + static_cast(&libvpx_test::kVP9)), + ::testing::Values(::libvpx_test::kTwoPassGood), + ::testing::Range(0, 4))); // cpu_used + +// Split this into two instantiations so that we can distinguish +// between very slow runs ( ie cpu_speed 0 ) vs ones that can be +// run nightly by adding Large to the title. +INSTANTIATE_TEST_CASE_P( + VP9, VPxEncoderThreadTest, + ::testing::Combine( + ::testing::Values( + static_cast(&libvpx_test::kVP9)), + ::testing::Values(::libvpx_test::kTwoPassGood, + ::libvpx_test::kOnePassGood, + ::libvpx_test::kRealTime), + ::testing::Range(3, 9), // cpu_used + ::testing::Range(0, 3), // tile_columns + ::testing::Range(2, 5))); // threads + +INSTANTIATE_TEST_CASE_P( + VP9Large, VPxEncoderThreadTest, + ::testing::Combine( + ::testing::Values( + static_cast(&libvpx_test::kVP9)), + ::testing::Values(::libvpx_test::kTwoPassGood, + ::libvpx_test::kOnePassGood, + ::libvpx_test::kRealTime), + ::testing::Range(0, 3), // cpu_used + ::testing::Range(0, 3), // tile_columns + ::testing::Range(2, 5))); // threads + } // namespace diff --git a/libs/libvpx/test/vp9_frame_parallel_test.cc b/libs/libvpx/test/vp9_frame_parallel_test.cc deleted file mode 100644 index 670cd4d721..0000000000 --- a/libs/libvpx/test/vp9_frame_parallel_test.cc +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include -#include -#include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vpx_config.h" -#include "test/codec_factory.h" -#include "test/decode_test_driver.h" -#include "test/ivf_video_source.h" -#include "test/md5_helper.h" -#include "test/util.h" -#if CONFIG_WEBM_IO -#include "test/webm_video_source.h" -#endif -#include "vpx_mem/vpx_mem.h" - -namespace { - -using std::string; - -#if CONFIG_WEBM_IO - -struct PauseFileList { - const char *name; - // md5 sum for decoded frames which does not include skipped frames. 
- const char *expected_md5; - const int pause_frame_num; -}; - -// Decodes |filename| with |num_threads|. Pause at the specified frame_num, -// seek to next key frame and then continue decoding until the end. Return -// the md5 of the decoded frames which does not include skipped frames. -string DecodeFileWithPause(const string &filename, int num_threads, - int pause_num) { - libvpx_test::WebMVideoSource video(filename); - video.Init(); - int in_frames = 0; - int out_frames = 0; - - vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); - cfg.threads = num_threads; - vpx_codec_flags_t flags = 0; - flags |= VPX_CODEC_USE_FRAME_THREADING; - libvpx_test::VP9Decoder decoder(cfg, flags); - - libvpx_test::MD5 md5; - video.Begin(); - - do { - ++in_frames; - const vpx_codec_err_t res = - decoder.DecodeFrame(video.cxdata(), video.frame_size()); - if (res != VPX_CODEC_OK) { - EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError(); - break; - } - - // Pause at specified frame number. - if (in_frames == pause_num) { - // Flush the decoder and then seek to next key frame. - decoder.DecodeFrame(NULL, 0); - video.SeekToNextKeyFrame(); - } else { - video.Next(); - } - - // Flush the decoder at the end of the video. - if (!video.cxdata()) decoder.DecodeFrame(NULL, 0); - - libvpx_test::DxDataIterator dec_iter = decoder.GetDxData(); - const vpx_image_t *img; - - // Get decompressed data - while ((img = dec_iter.Next())) { - ++out_frames; - md5.Add(img); - } - } while (video.cxdata() != NULL); - - EXPECT_EQ(in_frames, out_frames) - << "Input frame count does not match output frame count"; - - return string(md5.Get()); -} - -void DecodeFilesWithPause(const PauseFileList files[]) { - for (const PauseFileList *iter = files; iter->name != NULL; ++iter) { - SCOPED_TRACE(iter->name); - for (int t = 2; t <= 8; ++t) { - EXPECT_EQ(iter->expected_md5, - DecodeFileWithPause(iter->name, t, iter->pause_frame_num)) - << "threads = " << t; - } - } -} - -TEST(VP9MultiThreadedFrameParallel, PauseSeekResume) { - // vp90-2-07-frame_parallel-1.webm is a 40 frame video file with - // one key frame for every ten frames. - static const PauseFileList files[] = { - { "vp90-2-07-frame_parallel-1.webm", "6ea7c3875d67252e7caf2bc6e75b36b1", - 6 }, - { "vp90-2-07-frame_parallel-1.webm", "4bb634160c7356a8d7d4299b6dc83a45", - 12 }, - { "vp90-2-07-frame_parallel-1.webm", "89772591e6ef461f9fa754f916c78ed8", - 26 }, - { NULL, NULL, 0 }, - }; - DecodeFilesWithPause(files); -} - -struct FileList { - const char *name; - // md5 sum for decoded frames which does not include corrupted frames. - const char *expected_md5; - // Expected number of decoded frames which does not include corrupted frames. - const int expected_frame_count; -}; - -// Decodes |filename| with |num_threads|. Return the md5 of the decoded -// frames which does not include corrupted frames. -string DecodeFile(const string &filename, int num_threads, - int expected_frame_count) { - libvpx_test::WebMVideoSource video(filename); - video.Init(); - - vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); - cfg.threads = num_threads; - const vpx_codec_flags_t flags = VPX_CODEC_USE_FRAME_THREADING; - libvpx_test::VP9Decoder decoder(cfg, flags); - - libvpx_test::MD5 md5; - video.Begin(); - - int out_frames = 0; - do { - const vpx_codec_err_t res = - decoder.DecodeFrame(video.cxdata(), video.frame_size()); - // TODO(hkuang): frame parallel mode should return an error on corruption. 
- if (res != VPX_CODEC_OK) { - EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError(); - break; - } - - video.Next(); - - // Flush the decoder at the end of the video. - if (!video.cxdata()) decoder.DecodeFrame(NULL, 0); - - libvpx_test::DxDataIterator dec_iter = decoder.GetDxData(); - const vpx_image_t *img; - - // Get decompressed data - while ((img = dec_iter.Next())) { - ++out_frames; - md5.Add(img); - } - } while (video.cxdata() != NULL); - - EXPECT_EQ(expected_frame_count, out_frames) - << "Input frame count does not match expected output frame count"; - - return string(md5.Get()); -} - -void DecodeFiles(const FileList files[]) { - for (const FileList *iter = files; iter->name != NULL; ++iter) { - SCOPED_TRACE(iter->name); - for (int t = 2; t <= 8; ++t) { - EXPECT_EQ(iter->expected_md5, - DecodeFile(iter->name, t, iter->expected_frame_count)) - << "threads = " << t; - } - } -} - -TEST(VP9MultiThreadedFrameParallel, InvalidFileTest) { - static const FileList files[] = { - // invalid-vp90-2-07-frame_parallel-1.webm is a 40 frame video file with - // one key frame for every ten frames. The 11th frame has corrupted data. - { "invalid-vp90-2-07-frame_parallel-1.webm", - "0549d0f45f60deaef8eb708e6c0eb6cb", 30 }, - // invalid-vp90-2-07-frame_parallel-2.webm is a 40 frame video file with - // one key frame for every ten frames. The 1st and 31st frames have - // corrupted data. - { "invalid-vp90-2-07-frame_parallel-2.webm", - "6a1f3cf6f9e7a364212fadb9580d525e", 20 }, - // invalid-vp90-2-07-frame_parallel-3.webm is a 40 frame video file with - // one key frame for every ten frames. The 5th and 13th frames have - // corrupted data. - { "invalid-vp90-2-07-frame_parallel-3.webm", - "8256544308de926b0681e04685b98677", 27 }, - { NULL, NULL, 0 }, - }; - DecodeFiles(files); -} - -TEST(VP9MultiThreadedFrameParallel, ValidFileTest) { - static const FileList files[] = { -#if CONFIG_VP9_HIGHBITDEPTH - { "vp92-2-20-10bit-yuv420.webm", "a16b99df180c584e8db2ffeda987d293", 10 }, -#endif - { NULL, NULL, 0 }, - }; - DecodeFiles(files); -} -#endif // CONFIG_WEBM_IO -} // namespace diff --git a/libs/libvpx/test/vp9_intrapred_test.cc b/libs/libvpx/test/vp9_intrapred_test.cc index 1c4a3392e4..39c5e79ebd 100644 --- a/libs/libvpx/test/vp9_intrapred_test.cc +++ b/libs/libvpx/test/vp9_intrapred_test.cc @@ -28,25 +28,25 @@ using libvpx_test::ACMRandom; const int count_test_block = 100000; -typedef void (*IntraPred)(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, const uint16_t *left, int bps); +typedef void (*IntraPredFunc)(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left); -struct IntraPredFunc { - IntraPredFunc(IntraPred pred = NULL, IntraPred ref = NULL, - int block_size_value = 0, int bit_depth_value = 0) +struct IntraPredParam { + IntraPredParam(IntraPredFunc pred = NULL, IntraPredFunc ref = NULL, + int block_size_value = 0, int bit_depth_value = 0) : pred_fn(pred), ref_fn(ref), block_size(block_size_value), bit_depth(bit_depth_value) {} - IntraPred pred_fn; - IntraPred ref_fn; + IntraPredFunc pred_fn; + IntraPredFunc ref_fn; int block_size; int bit_depth; }; -class VP9IntraPredTest : public ::testing::TestWithParam { +template +class IntraPredTest : public ::testing::TestWithParam { public: - void RunTest(uint16_t *left_col, uint16_t *above_data, uint16_t *dst, - uint16_t *ref_dst) { + void RunTest(Pixel *left_col, Pixel *above_data, Pixel *dst, Pixel *ref_dst) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const int block_size = params_.block_size; above_row_ = above_data + 
16; @@ -56,13 +56,16 @@ class VP9IntraPredTest : public ::testing::TestWithParam { int error_count = 0; for (int i = 0; i < count_test_block; ++i) { // Fill edges with random data, try first with saturated values. - for (int x = -1; x <= block_size * 2; x++) { + for (int x = -1; x < block_size; x++) { if (i == 0) { above_row_[x] = mask_; } else { above_row_[x] = rnd.Rand16() & mask_; } } + for (int x = block_size; x < 2 * block_size; x++) { + above_row_[x] = above_row_[block_size - 1]; + } for (int y = 0; y < block_size; y++) { if (i == 0) { left_col_[y] = mask_; @@ -78,17 +81,12 @@ class VP9IntraPredTest : public ::testing::TestWithParam { protected: virtual void SetUp() { - params_ = GetParam(); + params_ = this->GetParam(); stride_ = params_.block_size * 3; mask_ = (1 << params_.bit_depth) - 1; } - void Predict() { - const int bit_depth = params_.bit_depth; - params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth); - ASM_REGISTER_STATE_CHECK( - params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth)); - } + void Predict(); void CheckPrediction(int test_case_number, int *error_count) const { // For each pixel ensure that the calculated value is the same as reference. @@ -104,17 +102,363 @@ class VP9IntraPredTest : public ::testing::TestWithParam { } } - uint16_t *above_row_; - uint16_t *left_col_; - uint16_t *dst_; - uint16_t *ref_dst_; + Pixel *above_row_; + Pixel *left_col_; + Pixel *dst_; + Pixel *ref_dst_; ptrdiff_t stride_; int mask_; - IntraPredFunc params_; + PredParam params_; }; +template <> +void IntraPredTest::Predict() { + params_.ref_fn(ref_dst_, stride_, above_row_, left_col_); + ASM_REGISTER_STATE_CHECK( + params_.pred_fn(dst_, stride_, above_row_, left_col_)); +} + +typedef IntraPredTest VP9IntraPredTest; + TEST_P(VP9IntraPredTest, IntraPredTests) { + // max block size is 32 + DECLARE_ALIGNED(16, uint8_t, left_col[2 * 32]); + DECLARE_ALIGNED(16, uint8_t, above_data[2 * 32 + 32]); + DECLARE_ALIGNED(16, uint8_t, dst[3 * 32 * 32]); + DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 32 * 32]); + RunTest(left_col, above_data, dst, ref_dst); +} + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, VP9IntraPredTest, + ::testing::Values( + IntraPredParam(&vpx_d45_predictor_4x4_sse2, &vpx_d45_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_d45_predictor_8x8_sse2, &vpx_d45_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d207_predictor_4x4_sse2, &vpx_d207_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_dc_128_predictor_4x4_sse2, + &vpx_dc_128_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_128_predictor_8x8_sse2, + &vpx_dc_128_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_128_predictor_16x16_sse2, + &vpx_dc_128_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_128_predictor_32x32_sse2, + &vpx_dc_128_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_left_predictor_4x4_sse2, + &vpx_dc_left_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_left_predictor_8x8_sse2, + &vpx_dc_left_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_left_predictor_16x16_sse2, + &vpx_dc_left_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_left_predictor_32x32_sse2, + &vpx_dc_left_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_predictor_4x4_sse2, &vpx_dc_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_dc_predictor_8x8_sse2, &vpx_dc_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_dc_predictor_16x16_sse2, &vpx_dc_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_dc_predictor_32x32_sse2, &vpx_dc_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_dc_top_predictor_4x4_sse2, + 
&vpx_dc_top_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_top_predictor_8x8_sse2, + &vpx_dc_top_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_top_predictor_16x16_sse2, + &vpx_dc_top_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_top_predictor_32x32_sse2, + &vpx_dc_top_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_h_predictor_4x4_sse2, &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_sse2, &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_h_predictor_16x16_sse2, &vpx_h_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_h_predictor_32x32_sse2, &vpx_h_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_tm_predictor_4x4_sse2, &vpx_tm_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_tm_predictor_8x8_sse2, &vpx_tm_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_tm_predictor_16x16_sse2, &vpx_tm_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_tm_predictor_32x32_sse2, &vpx_tm_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_v_predictor_4x4_sse2, &vpx_v_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_v_predictor_8x8_sse2, &vpx_v_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_v_predictor_16x16_sse2, &vpx_v_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_v_predictor_32x32_sse2, &vpx_v_predictor_32x32_c, + 32, 8))); +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +INSTANTIATE_TEST_CASE_P( + SSSE3, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_d45_predictor_16x16_ssse3, + &vpx_d45_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d45_predictor_32x32_ssse3, + &vpx_d45_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d63_predictor_4x4_ssse3, + &vpx_d63_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_d63_predictor_8x8_ssse3, + &vpx_d63_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_d63_predictor_16x16_ssse3, + &vpx_d63_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d63_predictor_32x32_ssse3, + &vpx_d63_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d153_predictor_4x4_ssse3, + &vpx_d153_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_d153_predictor_8x8_ssse3, + &vpx_d153_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_d153_predictor_16x16_ssse3, + &vpx_d153_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d153_predictor_32x32_ssse3, + &vpx_d153_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d207_predictor_8x8_ssse3, + &vpx_d207_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_d207_predictor_16x16_ssse3, + &vpx_d207_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d207_predictor_32x32_ssse3, + &vpx_d207_predictor_32x32_c, 32, 8))); +#endif // HAVE_SSSE3 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, VP9IntraPredTest, + ::testing::Values( + IntraPredParam(&vpx_d45_predictor_4x4_neon, &vpx_d45_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_d45_predictor_8x8_neon, &vpx_d45_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d45_predictor_16x16_neon, + &vpx_d45_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d45_predictor_32x32_neon, + &vpx_d45_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d135_predictor_8x8_neon, &vpx_d135_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d135_predictor_16x16_neon, + &vpx_d135_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d135_predictor_32x32_neon, + &vpx_d135_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_128_predictor_4x4_neon, + &vpx_dc_128_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_128_predictor_8x8_neon, + &vpx_dc_128_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_128_predictor_16x16_neon, + &vpx_dc_128_predictor_16x16_c, 16, 8), + 
IntraPredParam(&vpx_dc_128_predictor_32x32_neon, + &vpx_dc_128_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_left_predictor_4x4_neon, + &vpx_dc_left_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_left_predictor_8x8_neon, + &vpx_dc_left_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_left_predictor_16x16_neon, + &vpx_dc_left_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_left_predictor_32x32_neon, + &vpx_dc_left_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_predictor_4x4_neon, &vpx_dc_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_dc_predictor_8x8_neon, &vpx_dc_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_dc_predictor_16x16_neon, &vpx_dc_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_dc_predictor_32x32_neon, &vpx_dc_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_dc_top_predictor_4x4_neon, + &vpx_dc_top_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_top_predictor_8x8_neon, + &vpx_dc_top_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_top_predictor_16x16_neon, + &vpx_dc_top_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_top_predictor_32x32_neon, + &vpx_dc_top_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_h_predictor_4x4_neon, &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_neon, &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_h_predictor_16x16_neon, &vpx_h_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_h_predictor_32x32_neon, &vpx_h_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_tm_predictor_4x4_neon, &vpx_tm_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_tm_predictor_8x8_neon, &vpx_tm_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_tm_predictor_16x16_neon, &vpx_tm_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_tm_predictor_32x32_neon, &vpx_tm_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_v_predictor_4x4_neon, &vpx_v_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_v_predictor_8x8_neon, &vpx_v_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_v_predictor_16x16_neon, &vpx_v_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_v_predictor_32x32_neon, &vpx_v_predictor_32x32_c, + 32, 8))); +#endif // HAVE_NEON + +#if HAVE_DSPR2 +INSTANTIATE_TEST_CASE_P( + DSPR2, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_dc_predictor_4x4_dspr2, + &vpx_dc_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_predictor_8x8_dspr2, + &vpx_dc_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_predictor_16x16_dspr2, + &vpx_dc_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_h_predictor_4x4_dspr2, + &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_dspr2, + &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_h_predictor_16x16_dspr2, + &vpx_h_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_tm_predictor_4x4_dspr2, + &vpx_tm_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_tm_predictor_8x8_dspr2, + &vpx_tm_predictor_8x8_c, 8, 8))); +#endif // HAVE_DSPR2 + +#if HAVE_MSA +INSTANTIATE_TEST_CASE_P( + MSA, VP9IntraPredTest, + ::testing::Values( + IntraPredParam(&vpx_dc_128_predictor_4x4_msa, + &vpx_dc_128_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_128_predictor_8x8_msa, + &vpx_dc_128_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_128_predictor_16x16_msa, + &vpx_dc_128_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_128_predictor_32x32_msa, + &vpx_dc_128_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_left_predictor_4x4_msa, + &vpx_dc_left_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_left_predictor_8x8_msa, + &vpx_dc_left_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_left_predictor_16x16_msa, + 
&vpx_dc_left_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_left_predictor_32x32_msa, + &vpx_dc_left_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_predictor_4x4_msa, &vpx_dc_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_dc_predictor_8x8_msa, &vpx_dc_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_dc_predictor_16x16_msa, &vpx_dc_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_dc_predictor_32x32_msa, &vpx_dc_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_dc_top_predictor_4x4_msa, + &vpx_dc_top_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_top_predictor_8x8_msa, + &vpx_dc_top_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_top_predictor_16x16_msa, + &vpx_dc_top_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_top_predictor_32x32_msa, + &vpx_dc_top_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_h_predictor_4x4_msa, &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_msa, &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_h_predictor_16x16_msa, &vpx_h_predictor_16x16_c, 16, + 8), + IntraPredParam(&vpx_h_predictor_32x32_msa, &vpx_h_predictor_32x32_c, 32, + 8), + IntraPredParam(&vpx_tm_predictor_4x4_msa, &vpx_tm_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_tm_predictor_8x8_msa, &vpx_tm_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_tm_predictor_16x16_msa, &vpx_tm_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_tm_predictor_32x32_msa, &vpx_tm_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_v_predictor_4x4_msa, &vpx_v_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_v_predictor_8x8_msa, &vpx_v_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_v_predictor_16x16_msa, &vpx_v_predictor_16x16_c, 16, + 8), + IntraPredParam(&vpx_v_predictor_32x32_msa, &vpx_v_predictor_32x32_c, 32, + 8))); +#endif // HAVE_MSA + +#if HAVE_VSX +INSTANTIATE_TEST_CASE_P( + VSX, VP9IntraPredTest, + ::testing::Values( + IntraPredParam(&vpx_d45_predictor_8x8_vsx, &vpx_d45_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d45_predictor_16x16_vsx, &vpx_d45_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_d45_predictor_32x32_vsx, &vpx_d45_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_d63_predictor_8x8_vsx, &vpx_d63_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d63_predictor_16x16_vsx, &vpx_d63_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_d63_predictor_32x32_vsx, &vpx_d63_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_dc_128_predictor_16x16_vsx, + &vpx_dc_128_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_128_predictor_32x32_vsx, + &vpx_dc_128_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_left_predictor_16x16_vsx, + &vpx_dc_left_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_left_predictor_32x32_vsx, + &vpx_dc_left_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_predictor_8x8_vsx, &vpx_dc_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_dc_predictor_16x16_vsx, &vpx_dc_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_dc_predictor_32x32_vsx, &vpx_dc_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_dc_top_predictor_16x16_vsx, + &vpx_dc_top_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_top_predictor_32x32_vsx, + &vpx_dc_top_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_h_predictor_4x4_vsx, &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_vsx, &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_h_predictor_16x16_vsx, &vpx_h_predictor_16x16_c, 16, + 8), + IntraPredParam(&vpx_h_predictor_32x32_vsx, &vpx_h_predictor_32x32_c, 32, + 8), + IntraPredParam(&vpx_tm_predictor_4x4_vsx, &vpx_tm_predictor_4x4_c, 4, + 8), + 
IntraPredParam(&vpx_tm_predictor_8x8_vsx, &vpx_tm_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_tm_predictor_16x16_vsx, &vpx_tm_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_tm_predictor_32x32_vsx, &vpx_tm_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_v_predictor_16x16_vsx, &vpx_v_predictor_16x16_c, 16, + 8), + IntraPredParam(&vpx_v_predictor_32x32_vsx, &vpx_v_predictor_32x32_c, 32, + 8))); +#endif // HAVE_VSX + +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bps); +struct HighbdIntraPredParam { + HighbdIntraPredParam(HighbdIntraPred pred = NULL, HighbdIntraPred ref = NULL, + int block_size_value = 0, int bit_depth_value = 0) + : pred_fn(pred), ref_fn(ref), block_size(block_size_value), + bit_depth(bit_depth_value) {} + + HighbdIntraPred pred_fn; + HighbdIntraPred ref_fn; + int block_size; + int bit_depth; +}; + +template <> +void IntraPredTest::Predict() { + const int bit_depth = params_.bit_depth; + params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth); + ASM_REGISTER_STATE_CHECK( + params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth)); +} + +typedef IntraPredTest VP9HighbdIntraPredTest; + +TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) { // max block size is 32 DECLARE_ALIGNED(16, uint16_t, left_col[2 * 32]); DECLARE_ALIGNED(16, uint16_t, above_data[2 * 32 + 32]); @@ -123,89 +467,575 @@ TEST_P(VP9IntraPredTest, IntraPredTests) { RunTest(left_col, above_data, dst, ref_dst); } +#if HAVE_SSSE3 +INSTANTIATE_TEST_CASE_P( + SSSE3_TO_C_8, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3, + &vpx_highbd_d45_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3, + &vpx_highbd_d45_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3, + &vpx_highbd_d45_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3, + &vpx_highbd_d45_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3, + &vpx_highbd_d63_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3, + &vpx_highbd_d63_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c, + &vpx_highbd_d63_predictor_32x32_ssse3, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3, + &vpx_highbd_d117_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3, + &vpx_highbd_d117_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c, + &vpx_highbd_d117_predictor_32x32_ssse3, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3, + &vpx_highbd_d135_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3, + &vpx_highbd_d135_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3, + &vpx_highbd_d135_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3, + &vpx_highbd_d153_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3, + &vpx_highbd_d153_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3, + &vpx_highbd_d153_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 
16, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 8))); + +INSTANTIATE_TEST_CASE_P( + SSSE3_TO_C_10, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3, + &vpx_highbd_d45_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3, + &vpx_highbd_d45_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3, + &vpx_highbd_d45_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3, + &vpx_highbd_d45_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3, + &vpx_highbd_d63_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3, + &vpx_highbd_d63_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c, + &vpx_highbd_d63_predictor_32x32_ssse3, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3, + &vpx_highbd_d117_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3, + &vpx_highbd_d117_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c, + &vpx_highbd_d117_predictor_32x32_ssse3, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3, + &vpx_highbd_d135_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3, + &vpx_highbd_d135_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3, + &vpx_highbd_d135_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3, + &vpx_highbd_d153_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3, + &vpx_highbd_d153_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3, + &vpx_highbd_d153_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 10))); + +INSTANTIATE_TEST_CASE_P( + SSSE3_TO_C_12, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3, + &vpx_highbd_d45_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3, + &vpx_highbd_d45_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3, + &vpx_highbd_d45_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3, + &vpx_highbd_d45_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3, + &vpx_highbd_d63_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3, + &vpx_highbd_d63_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c, + &vpx_highbd_d63_predictor_32x32_ssse3, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3, + &vpx_highbd_d117_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3, + &vpx_highbd_d117_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c, + &vpx_highbd_d117_predictor_32x32_ssse3, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3, + &vpx_highbd_d135_predictor_8x8_c, 8, 12), + 
HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3, + &vpx_highbd_d135_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3, + &vpx_highbd_d135_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3, + &vpx_highbd_d153_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3, + &vpx_highbd_d153_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3, + &vpx_highbd_d153_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 12))); +#endif // HAVE_SSSE3 + #if HAVE_SSE2 -#if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P( - SSE2_TO_C_8, VP9IntraPredTest, - ::testing::Values(IntraPredFunc(&vpx_highbd_dc_predictor_32x32_sse2, - &vpx_highbd_dc_predictor_32x32_c, 32, 8), - IntraPredFunc(&vpx_highbd_tm_predictor_16x16_sse2, - &vpx_highbd_tm_predictor_16x16_c, 16, 8), - IntraPredFunc(&vpx_highbd_tm_predictor_32x32_sse2, - &vpx_highbd_tm_predictor_32x32_c, 32, 8), - IntraPredFunc(&vpx_highbd_dc_predictor_4x4_sse2, - &vpx_highbd_dc_predictor_4x4_c, 4, 8), - IntraPredFunc(&vpx_highbd_dc_predictor_8x8_sse2, - &vpx_highbd_dc_predictor_8x8_c, 8, 8), - IntraPredFunc(&vpx_highbd_dc_predictor_16x16_sse2, - &vpx_highbd_dc_predictor_16x16_c, 16, 8), - IntraPredFunc(&vpx_highbd_v_predictor_4x4_sse2, - &vpx_highbd_v_predictor_4x4_c, 4, 8), - IntraPredFunc(&vpx_highbd_v_predictor_8x8_sse2, - &vpx_highbd_v_predictor_8x8_c, 8, 8), - IntraPredFunc(&vpx_highbd_v_predictor_16x16_sse2, - &vpx_highbd_v_predictor_16x16_c, 16, 8), - IntraPredFunc(&vpx_highbd_v_predictor_32x32_sse2, - &vpx_highbd_v_predictor_32x32_c, 32, 8), - IntraPredFunc(&vpx_highbd_tm_predictor_4x4_sse2, - &vpx_highbd_tm_predictor_4x4_c, 4, 8), - IntraPredFunc(&vpx_highbd_tm_predictor_8x8_sse2, - &vpx_highbd_tm_predictor_8x8_c, 8, 8))); + SSE2_TO_C_8, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2, + &vpx_highbd_d63_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2, + &vpx_highbd_d117_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2, + &vpx_highbd_d135_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2, + &vpx_highbd_d153_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2, + 
&vpx_highbd_dc_left_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, + &vpx_highbd_dc_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, + &vpx_highbd_dc_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_sse2, + &vpx_highbd_dc_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, + &vpx_highbd_dc_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, + &vpx_highbd_tm_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, + &vpx_highbd_tm_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_sse2, + &vpx_highbd_tm_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, + &vpx_highbd_tm_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, + &vpx_highbd_v_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, + &vpx_highbd_v_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_sse2, + &vpx_highbd_v_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, + &vpx_highbd_v_predictor_32x32_c, 32, 8))); INSTANTIATE_TEST_CASE_P( - SSE2_TO_C_10, VP9IntraPredTest, - ::testing::Values(IntraPredFunc(&vpx_highbd_dc_predictor_32x32_sse2, - &vpx_highbd_dc_predictor_32x32_c, 32, 10), - IntraPredFunc(&vpx_highbd_tm_predictor_16x16_sse2, - &vpx_highbd_tm_predictor_16x16_c, 16, 10), - IntraPredFunc(&vpx_highbd_tm_predictor_32x32_sse2, - &vpx_highbd_tm_predictor_32x32_c, 32, 10), - IntraPredFunc(&vpx_highbd_dc_predictor_4x4_sse2, - &vpx_highbd_dc_predictor_4x4_c, 4, 10), - IntraPredFunc(&vpx_highbd_dc_predictor_8x8_sse2, - &vpx_highbd_dc_predictor_8x8_c, 8, 10), - IntraPredFunc(&vpx_highbd_dc_predictor_16x16_sse2, - &vpx_highbd_dc_predictor_16x16_c, 16, 10), - IntraPredFunc(&vpx_highbd_v_predictor_4x4_sse2, - &vpx_highbd_v_predictor_4x4_c, 4, 10), - IntraPredFunc(&vpx_highbd_v_predictor_8x8_sse2, - &vpx_highbd_v_predictor_8x8_c, 8, 10), - IntraPredFunc(&vpx_highbd_v_predictor_16x16_sse2, - &vpx_highbd_v_predictor_16x16_c, 16, 10), - IntraPredFunc(&vpx_highbd_v_predictor_32x32_sse2, - &vpx_highbd_v_predictor_32x32_c, 32, 10), - IntraPredFunc(&vpx_highbd_tm_predictor_4x4_sse2, - &vpx_highbd_tm_predictor_4x4_c, 4, 10), - IntraPredFunc(&vpx_highbd_tm_predictor_8x8_sse2, - &vpx_highbd_tm_predictor_8x8_c, 8, 10))); + SSE2_TO_C_10, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2, + 
&vpx_highbd_dc_128_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2, + &vpx_highbd_d63_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2, + &vpx_highbd_d117_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2, + &vpx_highbd_d135_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2, + &vpx_highbd_d153_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, + &vpx_highbd_dc_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, + &vpx_highbd_dc_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_sse2, + &vpx_highbd_dc_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, + &vpx_highbd_dc_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, + &vpx_highbd_tm_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, + &vpx_highbd_tm_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_sse2, + &vpx_highbd_tm_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, + &vpx_highbd_tm_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, + &vpx_highbd_v_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, + &vpx_highbd_v_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_sse2, + &vpx_highbd_v_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, + &vpx_highbd_v_predictor_32x32_c, 32, 10))); INSTANTIATE_TEST_CASE_P( - SSE2_TO_C_12, VP9IntraPredTest, - ::testing::Values(IntraPredFunc(&vpx_highbd_dc_predictor_32x32_sse2, - &vpx_highbd_dc_predictor_32x32_c, 32, 12), - IntraPredFunc(&vpx_highbd_tm_predictor_16x16_sse2, - 
&vpx_highbd_tm_predictor_16x16_c, 16, 12), - IntraPredFunc(&vpx_highbd_tm_predictor_32x32_sse2, - &vpx_highbd_tm_predictor_32x32_c, 32, 12), - IntraPredFunc(&vpx_highbd_dc_predictor_4x4_sse2, - &vpx_highbd_dc_predictor_4x4_c, 4, 12), - IntraPredFunc(&vpx_highbd_dc_predictor_8x8_sse2, - &vpx_highbd_dc_predictor_8x8_c, 8, 12), - IntraPredFunc(&vpx_highbd_dc_predictor_16x16_sse2, - &vpx_highbd_dc_predictor_16x16_c, 16, 12), - IntraPredFunc(&vpx_highbd_v_predictor_4x4_sse2, - &vpx_highbd_v_predictor_4x4_c, 4, 12), - IntraPredFunc(&vpx_highbd_v_predictor_8x8_sse2, - &vpx_highbd_v_predictor_8x8_c, 8, 12), - IntraPredFunc(&vpx_highbd_v_predictor_16x16_sse2, - &vpx_highbd_v_predictor_16x16_c, 16, 12), - IntraPredFunc(&vpx_highbd_v_predictor_32x32_sse2, - &vpx_highbd_v_predictor_32x32_c, 32, 12), - IntraPredFunc(&vpx_highbd_tm_predictor_4x4_sse2, - &vpx_highbd_tm_predictor_4x4_c, 4, 12), - IntraPredFunc(&vpx_highbd_tm_predictor_8x8_sse2, - &vpx_highbd_tm_predictor_8x8_c, 8, 12))); + SSE2_TO_C_12, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2, + &vpx_highbd_d63_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2, + &vpx_highbd_d117_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2, + &vpx_highbd_d135_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2, + &vpx_highbd_d153_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, + &vpx_highbd_dc_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, + &vpx_highbd_dc_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_sse2, + &vpx_highbd_dc_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, + &vpx_highbd_dc_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, + &vpx_highbd_tm_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, + &vpx_highbd_tm_predictor_8x8_c, 8, 12), + 
HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_sse2, + &vpx_highbd_tm_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, + &vpx_highbd_tm_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, + &vpx_highbd_v_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, + &vpx_highbd_v_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_sse2, + &vpx_highbd_v_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, + &vpx_highbd_v_predictor_32x32_c, 32, 12))); +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON_TO_C_8, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon, + &vpx_highbd_d45_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon, + &vpx_highbd_d45_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon, + &vpx_highbd_d45_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, + &vpx_highbd_d45_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, + &vpx_highbd_d135_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, + &vpx_highbd_d135_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon, + &vpx_highbd_d135_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, + &vpx_highbd_d135_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_neon, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_neon, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_neon, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_neon, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_neon, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_neon, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_neon, + &vpx_highbd_dc_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_neon, + &vpx_highbd_dc_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_neon, + &vpx_highbd_dc_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_neon, + &vpx_highbd_dc_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_neon, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_neon, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon, + 
&vpx_highbd_dc_top_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon, + &vpx_highbd_h_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon, + &vpx_highbd_h_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon, + &vpx_highbd_h_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon, + &vpx_highbd_h_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon, + &vpx_highbd_tm_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon, + &vpx_highbd_tm_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon, + &vpx_highbd_tm_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon, + &vpx_highbd_tm_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon, + &vpx_highbd_v_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon, + &vpx_highbd_v_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon, + &vpx_highbd_v_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon, + &vpx_highbd_v_predictor_32x32_c, 32, 8))); + +INSTANTIATE_TEST_CASE_P( + NEON_TO_C_10, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon, + &vpx_highbd_d45_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon, + &vpx_highbd_d45_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon, + &vpx_highbd_d45_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, + &vpx_highbd_d45_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, + &vpx_highbd_d135_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, + &vpx_highbd_d135_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon, + &vpx_highbd_d135_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, + &vpx_highbd_d135_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_neon, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_neon, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_neon, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_neon, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_neon, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_neon, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_neon, + &vpx_highbd_dc_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_neon, + &vpx_highbd_dc_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_neon, + &vpx_highbd_dc_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_neon, + 
&vpx_highbd_dc_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_neon, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_neon, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon, + &vpx_highbd_h_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon, + &vpx_highbd_h_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon, + &vpx_highbd_h_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon, + &vpx_highbd_h_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon, + &vpx_highbd_tm_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon, + &vpx_highbd_tm_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon, + &vpx_highbd_tm_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon, + &vpx_highbd_tm_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon, + &vpx_highbd_v_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon, + &vpx_highbd_v_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon, + &vpx_highbd_v_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon, + &vpx_highbd_v_predictor_32x32_c, 32, 10))); + +INSTANTIATE_TEST_CASE_P( + NEON_TO_C_12, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon, + &vpx_highbd_d45_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon, + &vpx_highbd_d45_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon, + &vpx_highbd_d45_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, + &vpx_highbd_d45_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, + &vpx_highbd_d135_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, + &vpx_highbd_d135_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon, + &vpx_highbd_d135_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, + &vpx_highbd_d135_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_neon, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_neon, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_neon, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_neon, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_neon, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_neon, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 12), + 
HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_neon, + &vpx_highbd_dc_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_neon, + &vpx_highbd_dc_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_neon, + &vpx_highbd_dc_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_neon, + &vpx_highbd_dc_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_neon, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_neon, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon, + &vpx_highbd_h_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon, + &vpx_highbd_h_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon, + &vpx_highbd_h_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon, + &vpx_highbd_h_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon, + &vpx_highbd_tm_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon, + &vpx_highbd_tm_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon, + &vpx_highbd_tm_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon, + &vpx_highbd_tm_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon, + &vpx_highbd_v_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon, + &vpx_highbd_v_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon, + &vpx_highbd_v_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon, + &vpx_highbd_v_predictor_32x32_c, 32, 12))); +#endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH -#endif // HAVE_SSE2 } // namespace diff --git a/libs/libvpx/test/vp9_motion_vector_test.cc b/libs/libvpx/test/vp9_motion_vector_test.cc new file mode 100644 index 0000000000..1030204ae3 --- /dev/null +++ b/libs/libvpx/test/vp9_motion_vector_test.cc @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/yuv_video_source.h" + +namespace { +#define MAX_EXTREME_MV 1 +#define MIN_EXTREME_MV 2 + +// Encoding modes +const libvpx_test::TestMode kEncodingModeVectors[] = { + ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood, + ::libvpx_test::kRealTime, +}; + +// Encoding speeds +const int kCpuUsedVectors[] = { 0, 1, 2, 3, 4, 5, 6 }; + +// MV test modes: 1 - always use maximum MV; 2 - always use minimum MV. 
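// The two mode values listed below feed VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST in
// PreEncodeFrameHook(). As a rough, hypothetical sketch of the intent (the
// real override lives inside the encoder, not in this test file), the control
// asks the encoder to pin every selected motion vector to one extreme of the
// allowed range:
//
//   int ForceExtremeMv(int mv_test_mode, int mv_min, int mv_max) {
//     // Hypothetical helper: mode 1 pins to the maximum allowed value,
//     // mode 2 pins to the minimum.
//     return (mv_test_mode == MAX_EXTREME_MV) ? mv_max : mv_min;
//   }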
+const int kMVTestModes[] = { MAX_EXTREME_MV, MIN_EXTREME_MV };
+
+class MotionVectorTestLarge
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith3Params<libvpx_test::TestMode, int,
+                                                 int> {
+ protected:
+  MotionVectorTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {}
+
+  virtual ~MotionVectorTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libvpx_test::kRealTime) {
+      cfg_.g_lag_in_frames = 3;
+      cfg_.rc_end_usage = VPX_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+      cfg_.rc_buf_sz = 1000;
+      cfg_.rc_buf_initial_sz = 500;
+      cfg_.rc_buf_optimal_sz = 600;
+    }
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+      encoder->Control(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_);
+      if (encoding_mode_ != ::libvpx_test::kRealTime) {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+        encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      }
+    }
+  }
+
+  libvpx_test::TestMode encoding_mode_;
+  int cpu_used_;
+  int mv_test_mode_;
+};
+
+TEST_P(MotionVectorTestLarge, OverallTest) {
+  cfg_.rc_target_bitrate = 24000;
+  cfg_.g_profile = 0;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  testing::internal::scoped_ptr<libvpx_test::VideoSource> video;
+  video.reset(new libvpx_test::YUVVideoSource(
+      "niklas_640_480_30.yuv", VPX_IMG_FMT_I420, 3840, 2160,  // 2048, 1080,
+      30, 1, 0, 5));
+
+  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+VP9_INSTANTIATE_TEST_CASE(MotionVectorTestLarge,
+                          ::testing::ValuesIn(kEncodingModeVectors),
+                          ::testing::ValuesIn(kCpuUsedVectors),
+                          ::testing::ValuesIn(kMVTestModes));
+}  // namespace
diff --git a/libs/libvpx/test/vp9_quantize_test.cc b/libs/libvpx/test/vp9_quantize_test.cc
index 4643895021..b18d4522ce 100644
--- a/libs/libvpx/test/vp9_quantize_test.cc
+++ b/libs/libvpx/test/vp9_quantize_test.cc
@@ -14,9 +14,11 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
+#include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
+#include "test/buffer.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
@@ -24,11 +26,12 @@
 #include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/vpx_timer.h"
 
 using libvpx_test::ACMRandom;
+using libvpx_test::Buffer;
 
 namespace {
-#if CONFIG_VP9_HIGHBITDEPTH
 const int number_of_iterations = 100;
 
 typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
@@ -38,307 +41,494 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
                              tran_low_t *dqcoeff, const int16_t *dequant,
                              uint16_t *eob, const int16_t *scan,
                              const int16_t *iscan);
-typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t>
+typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t, int, bool>
     QuantizeParam;
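// A QuantizeParam entry therefore carries: the function under test, the C
// reference it must match, the bit depth, the largest transform dimension
// exercised (16 covers 4x4/8x8/16x16, 32 covers 32x32), and whether the pair
// uses the "fp" call convention. As a purely illustrative sketch (not one of
// the real instantiations in this file), an 8-bit entry could be built as:
//
//   std::tr1::make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8,
//                        16, false)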
-class VP9QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
+// Wrapper for FP version which does not use zbin or quant_shift.
+typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count,
+                               int skip_block, const int16_t *round,
+                               const int16_t *quant, tran_low_t *qcoeff,
+                               tran_low_t *dqcoeff, const int16_t *dequant,
+                               uint16_t *eob, const int16_t *scan,
+                               const int16_t *iscan);
+
+template <QuantizeFPFunc fn>
+void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, int skip_block,
+                    const int16_t *zbin, const int16_t *round,
+                    const int16_t *quant, const int16_t *quant_shift,
+                    tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                    const int16_t *dequant, uint16_t *eob, const int16_t *scan,
+                    const int16_t *iscan) {
+  (void)zbin;
+  (void)quant_shift;
+
+  fn(coeff, count, skip_block, round, quant, qcoeff, dqcoeff, dequant, eob,
+     scan, iscan);
+}
+
+class VP9QuantizeBase {
  public:
-  virtual ~VP9QuantizeTest() {}
-  virtual void SetUp() {
-    quantize_op_ = GET_PARAM(0);
-    ref_quantize_op_ = GET_PARAM(1);
-    bit_depth_ = GET_PARAM(2);
-    mask_ = (1 << bit_depth_) - 1;
+  VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp)
+      : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp) {
+    max_value_ = (1 << bit_depth_) - 1;
+    zbin_ptr_ =
+        reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_)));
+    round_fp_ptr_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, 8 * sizeof(*round_fp_ptr_)));
+    quant_fp_ptr_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_)));
+    round_ptr_ =
+        reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_)));
+    quant_ptr_ =
+        reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*quant_ptr_)));
+    quant_shift_ptr_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_)));
+    dequant_ptr_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, 8 * sizeof(*dequant_ptr_)));
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  vpx_bit_depth_t bit_depth_;
-  int mask_;
-  QuantizeFunc quantize_op_;
-  QuantizeFunc ref_quantize_op_;
-};
-
-class VP9Quantize32Test : public ::testing::TestWithParam<QuantizeParam> {
- public:
-  virtual ~VP9Quantize32Test() {}
-  virtual void SetUp() {
-    quantize_op_ = GET_PARAM(0);
-    ref_quantize_op_ = GET_PARAM(1);
-    bit_depth_ = GET_PARAM(2);
-    mask_ = (1 << bit_depth_) - 1;
+  ~VP9QuantizeBase() {
+    vpx_free(zbin_ptr_);
+    vpx_free(round_fp_ptr_);
+    vpx_free(quant_fp_ptr_);
+    vpx_free(round_ptr_);
+    vpx_free(quant_ptr_);
+    vpx_free(quant_shift_ptr_);
+    vpx_free(dequant_ptr_);
+    zbin_ptr_ = NULL;
+    round_fp_ptr_ = NULL;
+    quant_fp_ptr_ = NULL;
+    round_ptr_ = NULL;
+    quant_ptr_ = NULL;
+    quant_shift_ptr_ = NULL;
+    dequant_ptr_ = NULL;
+    libvpx_test::ClearSystemState();
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+ protected:
+  int16_t *zbin_ptr_;
+  int16_t *round_fp_ptr_;
+  int16_t *quant_fp_ptr_;
+  int16_t *round_ptr_;
+  int16_t *quant_ptr_;
+  int16_t *quant_shift_ptr_;
+  int16_t *dequant_ptr_;
+  const vpx_bit_depth_t bit_depth_;
+  int max_value_;
+  const int max_size_;
+  const bool is_fp_;
+};
+
+class VP9QuantizeTest : public VP9QuantizeBase,
+                        public ::testing::TestWithParam<QuantizeParam> {
+ public:
+  VP9QuantizeTest()
+      : VP9QuantizeBase(GET_PARAM(2), GET_PARAM(3), GET_PARAM(4)),
+        quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {}
 
  protected:
-  vpx_bit_depth_t bit_depth_;
-  int mask_;
-  QuantizeFunc quantize_op_;
-  QuantizeFunc ref_quantize_op_;
+  const QuantizeFunc quantize_op_;
+  const QuantizeFunc ref_quantize_op_;
 };
+// This quantizer compares the AC coefficients to the quantization step size to
+// determine if further multiplication operations are needed.
+// Based on vp9_quantize_fp_sse2().
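// In essence, the reference below applies, per coefficient (index 0 of the
// round/quant/dequant tables for DC, index 1 for AC):
//
//   abs_q   = (clamp(|coeff| + round, INT16_MIN, INT16_MAX) * quant) >> 16;
//   qcoeff  = sign(coeff) * abs_q;
//   dqcoeff = qcoeff * dequant;
//
// and it zeroes an entire 16-coefficient group (after the first group) when
// every magnitude in it is at most dequant[1] / 2. Worked example, using the
// fp tables as GenerateHelperArrays() below builds them: qlookup = 50 gives
// round_fp = (64 * 50) >> 7 = 25 and quant_fp = (1 << 16) / 50 = 1310; take
// dequant = 50. A coefficient of 137 then quantizes to
// ((137 + 25) * 1310) >> 16 = 3 and dequantizes back to 3 * 50 = 150.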
+void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + int i, eob = -1; + const int thr = dequant_ptr[1] >> 1; + (void)iscan; + (void)skip_block; + assert(!skip_block); + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i += 16) { + int y; + int nzflag_cnt = 0; + int abs_coeff[16]; + int coeff_sign[16]; + + // count nzflag for each row (16 tran_low_t) + for (y = 0; y < 16; ++y) { + const int rc = i + y; + const int coeff = coeff_ptr[rc]; + coeff_sign[y] = (coeff >> 31); + abs_coeff[y] = (coeff ^ coeff_sign[y]) - coeff_sign[y]; + // The first 16 are skipped in the sse2 code. Do the same here to match. + if (i >= 16 && (abs_coeff[y] <= thr)) { + nzflag_cnt++; + } + } + + for (y = 0; y < 16; ++y) { + const int rc = i + y; + // If all of the AC coeffs in a row has magnitude less than the + // quantization step_size/2, quantize to zero. + if (nzflag_cnt < 16) { + int tmp = + clamp(abs_coeff[y] + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> 16; + qcoeff_ptr[rc] = (tmp ^ coeff_sign[y]) - coeff_sign[y]; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + } else { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + } + } + + // Scan for eob. + for (i = 0; i < n_coeffs; i++) { + // Use the scan order to find the correct eob. + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + } + } + *eob_ptr = eob + 1; +} + +void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, + int16_t *quant, int16_t *quant_shift, + int16_t *dequant, int16_t *round_fp, + int16_t *quant_fp) { + // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V. + const int max_qrounding_factor_fp = 64; + + for (int j = 0; j < 2; j++) { + // The range is 4 to 1828 in the VP9 tables. + const int qlookup = rnd->RandRange(1825) + 4; + round_fp[j] = (max_qrounding_factor_fp * qlookup) >> 7; + quant_fp[j] = (1 << 16) / qlookup; + + // Values determined by deconstructing vp9_init_quantizer(). + // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y + // values or U/V values of any bit depth. This is because y_delta is not + // factored into the vp9_ac_quant() call. + zbin[j] = rnd->RandRange(1200); + + // round may be up to 685 for Y values or 914 for U/V. + round[j] = rnd->RandRange(914); + // quant ranges from 1 to -32703 + quant[j] = static_cast(rnd->RandRange(32704)) - 32703; + // quant_shift goes up to 1 << 16. + quant_shift[j] = rnd->RandRange(16384); + // dequant maxes out at 1828 for all cases. 
+ dequant[j] = rnd->RandRange(1828); + } + for (int j = 2; j < 8; j++) { + zbin[j] = zbin[1]; + round_fp[j] = round_fp[1]; + quant_fp[j] = quant_fp[1]; + round[j] = round[1]; + quant[j] = quant[1]; + quant_shift[j] = quant_shift[1]; + dequant[j] = dequant[1]; + } +} + TEST_P(VP9QuantizeTest, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; - for (int i = 0; i < number_of_iterations; ++i) { - const int skip_block = i == 0; - const TX_SIZE sz = (TX_SIZE)(i % 3); // TX_4X4, TX_8X8 TX_16X16 - const TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - const int count = (4 << sz) * (4 << sz); // 16, 64, 256 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; - for (int j = 0; j < count; j++) { - coeff_ptr[j] = rnd.Rand16() & mask_; - } - for (int j = 0; j < 2; j++) { - zbin_ptr[j] = rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - dequant_ptr[j] = rnd.Rand16(); - } - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, - scan_order->scan, scan_order->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); - } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; - } - err_count_total += err_count; - } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. 
" - << "First failed at test case " << first_failure; -} + Buffer coeff = Buffer(max_size_, max_size_, 0, 16); + ASSERT_TRUE(coeff.Init()); + Buffer qcoeff = Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(qcoeff.Init()); + Buffer dqcoeff = Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(dqcoeff.Init()); + Buffer ref_qcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_qcoeff.Init()); + Buffer ref_dqcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_dqcoeff.Init()); + uint16_t eob, ref_eob; -TEST_P(VP9Quantize32Test, OperationCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; for (int i = 0; i < number_of_iterations; ++i) { - const int skip_block = i == 0; - const TX_SIZE sz = TX_32X32; - const TX_TYPE tx_type = (TX_TYPE)(i % 4); + // Test skip block for the first three iterations to catch all the different + // sizes. + const int skip_block = 0; + TX_SIZE sz; + if (max_size_ == 16) { + sz = static_cast(i % 3); // TX_4X4, TX_8X8 TX_16X16 + } else { + sz = TX_32X32; + } + const TX_TYPE tx_type = static_cast((i >> 2) % 3); const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - const int count = (4 << sz) * (4 << sz); // 1024 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; - for (int j = 0; j < count; j++) { - coeff_ptr[j] = rnd.Rand16() & mask_; - } - for (int j = 0; j < 2; j++) { - zbin_ptr[j] = rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - dequant_ptr[j] = rnd.Rand16(); - } - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, + const int count = (4 << sz) * (4 << sz); + coeff.Set(&rnd, -max_value_, max_value_); + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; + int16_t *q_ptr = (is_fp_) ? 
quant_fp_ptr_ : quant_ptr_; + ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, + q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, scan_order->scan, scan_order->iscan); + ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); + coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, + quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(), + dequant_ptr_, &eob, scan_order->scan, scan_order->iscan)); + + EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff)); + EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff)); + + EXPECT_EQ(eob, ref_eob); + + if (HasFailure()) { + printf("Failure on iteration %d.\n", i); + qcoeff.PrintDifference(ref_qcoeff); + dqcoeff.PrintDifference(ref_dqcoeff); + return; } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; - } - err_count_total += err_count; } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. " - << "First failed at test case " << first_failure; } TEST_P(VP9QuantizeTest, EOBCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; + Buffer coeff = Buffer(max_size_, max_size_, 0, 16); + ASSERT_TRUE(coeff.Init()); + Buffer qcoeff = Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(qcoeff.Init()); + Buffer dqcoeff = Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(dqcoeff.Init()); + Buffer ref_qcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_qcoeff.Init()); + Buffer ref_dqcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_dqcoeff.Init()); + uint16_t eob, ref_eob; + for (int i = 0; i < number_of_iterations; ++i) { - int skip_block = i == 0; - TX_SIZE sz = (TX_SIZE)(i % 3); // TX_4X4, TX_8X8 TX_16X16 - TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3); + const int skip_block = 0; + TX_SIZE sz; + if (max_size_ == 16) { + sz = static_cast(i % 3); // TX_4X4, TX_8X8 TX_16X16 + } else { + sz = TX_32X32; + } + const TX_TYPE tx_type = static_cast((i >> 2) % 3); const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - int count = (4 << sz) * (4 << sz); // 16, 64, 256 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; + int count = (4 << sz) * (4 << sz); // Two random entries - for (int j = 0; j < count; j++) { - coeff_ptr[j] = 0; - } - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - for (int j = 0; j < 2; j++) { - zbin_ptr[j] = rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - 
dequant_ptr[j] = rnd.Rand16(); - } - - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, + coeff.Set(0); + coeff.TopLeftPixel()[rnd(count)] = + static_cast(rnd.RandRange(max_value_ * 2)) - max_value_; + coeff.TopLeftPixel()[rnd(count)] = + static_cast(rnd.RandRange(max_value_ * 2)) - max_value_; + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; + int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_; + ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, + q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, scan_order->scan, scan_order->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); + ASM_REGISTER_STATE_CHECK(quantize_op_( + coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, + quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(), + dequant_ptr_, &eob, scan_order->scan, scan_order->iscan)); + + EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff)); + EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff)); + + EXPECT_EQ(eob, ref_eob); + + if (HasFailure()) { + printf("Failure on iteration %d.\n", i); + qcoeff.PrintDifference(ref_qcoeff); + dqcoeff.PrintDifference(ref_dqcoeff); + return; } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; - } - err_count_total += err_count; } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. 
" - << "First failed at test case " << first_failure; } -TEST_P(VP9Quantize32Test, EOBCheck) { +TEST_P(VP9QuantizeTest, DISABLED_Speed) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; - for (int i = 0; i < number_of_iterations; ++i) { - int skip_block = i == 0; - TX_SIZE sz = TX_32X32; - TX_TYPE tx_type = (TX_TYPE)(i % 4); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - int count = (4 << sz) * (4 << sz); // 1024 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; - for (int j = 0; j < count; j++) { - coeff_ptr[j] = 0; - } - // Two random entries - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - for (int j = 0; j < 2; j++) { - zbin_ptr[j] = rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - dequant_ptr[j] = rnd.Rand16(); - } + Buffer coeff = Buffer(max_size_, max_size_, 0, 16); + ASSERT_TRUE(coeff.Init()); + Buffer qcoeff = Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(qcoeff.Init()); + Buffer dqcoeff = Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(dqcoeff.Init()); + uint16_t eob; + TX_SIZE starting_sz, ending_sz; - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, - scan_order->scan, scan_order->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); - - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); - } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; - } - err_count_total += err_count; + if (max_size_ == 16) { + starting_sz = TX_4X4; + ending_sz = TX_16X16; + } else { + starting_sz = TX_32X32; + ending_sz = TX_32X32; + } + + for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { + // zbin > coeff, zbin < coeff. + for (int i = 0; i < 2; ++i) { + const int skip_block = 0; + // TX_TYPE defines the scan order. That is not relevant to the speed test. + // Pick the first one. + const TX_TYPE tx_type = DCT_DCT; + const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; + const int count = (4 << sz) * (4 << sz); + + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; + int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_; + + if (i == 0) { + // When |coeff values| are less than zbin the results are 0. + int threshold = 100; + if (max_size_ == 32) { + // For 32x32, the threshold is halved. Double it to keep the values + // from clearing it. 
+ threshold = 200; + } + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold; + coeff.Set(&rnd, -99, 99); + } else if (i == 1) { + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50; + coeff.Set(&rnd, -500, 500); + } + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int j = 0; j < 100000000 / count; ++j) { + quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, + q_ptr, quant_shift_ptr_, qcoeff.TopLeftPixel(), + dqcoeff.TopLeftPixel(), dequant_ptr_, &eob, + scan_order->scan, scan_order->iscan); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + if (i == 0) printf("Bypass calculations.\n"); + if (i == 1) printf("Full calculations.\n"); + printf("Quantize %dx%d time: %5d ms\n", 4 << sz, 4 << sz, + elapsed_time / 1000); + } + printf("\n"); } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. " - << "First failed at test case " << first_failure; } + using std::tr1::make_tuple; #if HAVE_SSE2 +#if CONFIG_VP9_HIGHBITDEPTH +// TODO(johannkoenig): Fix vpx_quantize_b_sse2 in highbitdepth builds. +// make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, VPX_BITS_8), INSTANTIATE_TEST_CASE_P( SSE2, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_highbd_quantize_b_sse2, - &vpx_highbd_quantize_b_c, VPX_BITS_8), - make_tuple(&vpx_highbd_quantize_b_sse2, - &vpx_highbd_quantize_b_c, VPX_BITS_10), - make_tuple(&vpx_highbd_quantize_b_sse2, - &vpx_highbd_quantize_b_c, VPX_BITS_12))); + ::testing::Values( + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + +#else INSTANTIATE_TEST_CASE_P( - SSE2, VP9Quantize32Test, - ::testing::Values(make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12))); -#endif // HAVE_SSE2 + SSE2, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true))); #endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH +#if ARCH_X86_64 +INSTANTIATE_TEST_CASE_P( + SSSE3, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true))); +#else +INSTANTIATE_TEST_CASE_P(SSSE3, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, + &vpx_quantize_b_c, + VPX_BITS_8, 16, false))); +#endif + +#if ARCH_X86_64 +// TODO(johannkoenig): SSSE3 optimizations do not yet pass this test. 
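+// They are registered under a DISABLED_ prefix so they can still be run
+// manually with --gtest_also_run_disabled_tests.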
+INSTANTIATE_TEST_CASE_P( + DISABLED_SSSE3, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_32x32_ssse3, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, + VPX_BITS_8, 32, true))); +#endif // ARCH_X86_64 +#endif // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH + +// TODO(johannkoenig): AVX optimizations do not yet pass the 32x32 test or +// highbitdepth configurations. +#if HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + // Even though SSSE3 and AVX do not match the reference + // code, we can keep them in sync with each other. + make_tuple(&vpx_quantize_b_32x32_avx, + &vpx_quantize_b_32x32_ssse3, VPX_BITS_8, 32, + false))); +#endif // HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH + +// TODO(webm:1448): dqcoeff is not handled correctly in HBD builds. +#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + NEON, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_neon, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, + VPX_BITS_8, 32, true))); +#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH + +// Only useful to compare "Speed" test results. +INSTANTIATE_TEST_CASE_P( + DISABLED_C, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8, + 32, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 32, + true))); } // namespace diff --git a/libs/libvpx/test/vp9_scale_test.cc b/libs/libvpx/test/vp9_scale_test.cc new file mode 100644 index 0000000000..5d7d38e89a --- /dev/null +++ b/libs/libvpx/test/vp9_scale_test.cc @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/vpx_scale_test.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" +#include "vpx_scale/yv12config.h" + +namespace libvpx_test { + +typedef void (*ScaleFrameFunc)(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + INTERP_FILTER filter_type, int phase_scaler); + +class ScaleTest : public VpxScaleBase, + public ::testing::TestWithParam { + public: + virtual ~ScaleTest() {} + + protected: + virtual void SetUp() { scale_fn_ = GetParam(); } + + void ReferenceScaleFrame(INTERP_FILTER filter_type, int phase_scaler) { + vp9_scale_and_extend_frame_c(&img_, &ref_img_, filter_type, phase_scaler); + } + + void ScaleFrame(INTERP_FILTER filter_type, int phase_scaler) { + ASM_REGISTER_STATE_CHECK( + scale_fn_(&img_, &dst_img_, filter_type, phase_scaler)); + } + + void RunTest() { + static const int kNumSizesToTest = 20; + static const int kNumScaleFactorsToTest = 4; + static const int kSizesToTest[] = { + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, + 22, 24, 26, 28, 30, 32, 34, 68, 128, 134 + }; + static const int kScaleFactors[] = { 1, 2, 3, 4 }; + for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) { + for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) { + for (int h = 0; h < kNumSizesToTest; ++h) { + const int src_height = kSizesToTest[h]; + for (int w = 0; w < kNumSizesToTest; ++w) { + const int src_width = kSizesToTest[w]; + for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; + ++sf_up_idx) { + const int sf_up = kScaleFactors[sf_up_idx]; + for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest; + ++sf_down_idx) { + const int sf_down = kScaleFactors[sf_down_idx]; + const int dst_width = src_width * sf_up / sf_down; + const int dst_height = src_height * sf_up / sf_down; + if (sf_up == sf_down && sf_up != 1) { + continue; + } + // I420 frame width and height must be even. + if (!dst_width || !dst_height || dst_width & 1 || + dst_height & 1) { + continue; + } + // vpx_convolve8_c() has restriction on the step which cannot + // exceed 64 (ratio 1 to 4). 
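+                // (The convolve filters step in 1/16-pel units, so a 4:1
+                // downscale corresponds to that maximum step of 16 * 4 = 64.)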
+ if (src_width > 4 * dst_width || src_height > 4 * dst_height) { + continue; + } + ASSERT_NO_FATAL_FAILURE(ResetScaleImages( + src_width, src_height, dst_width, dst_height)); + ReferenceScaleFrame(filter_type, phase_scaler); + ScaleFrame(filter_type, phase_scaler); + if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, + ref_img_.frame_size)) { + printf( + "filter_type = %d, phase_scaler = %d, src_width = %4d, " + "src_height = %4d, dst_width = %4d, dst_height = %4d, " + "scale factor = %d:%d\n", + filter_type, phase_scaler, src_width, src_height, + dst_width, dst_height, sf_down, sf_up); + PrintDiff(); + } + CompareImages(dst_img_); + DeallocScaleImages(); + } + } + } + } + } + } + } + + void PrintDiffComponent(const uint8_t *const ref, const uint8_t *const opt, + const int stride, const int width, const int height, + const int plane_idx) const { + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + if (ref[y * stride + x] != opt[y * stride + x]) { + printf("Plane %d pixel[%d][%d] diff:%6d (ref),%6d (opt)\n", plane_idx, + y, x, ref[y * stride + x], opt[y * stride + x]); + break; + } + } + } + } + + void PrintDiff() const { + assert(ref_img_.y_stride == dst_img_.y_stride); + assert(ref_img_.y_width == dst_img_.y_width); + assert(ref_img_.y_height == dst_img_.y_height); + assert(ref_img_.uv_stride == dst_img_.uv_stride); + assert(ref_img_.uv_width == dst_img_.uv_width); + assert(ref_img_.uv_height == dst_img_.uv_height); + + if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, + ref_img_.frame_size)) { + PrintDiffComponent(ref_img_.y_buffer, dst_img_.y_buffer, + ref_img_.y_stride, ref_img_.y_width, ref_img_.y_height, + 0); + PrintDiffComponent(ref_img_.u_buffer, dst_img_.u_buffer, + ref_img_.uv_stride, ref_img_.uv_width, + ref_img_.uv_height, 1); + PrintDiffComponent(ref_img_.v_buffer, dst_img_.v_buffer, + ref_img_.uv_stride, ref_img_.uv_width, + ref_img_.uv_height, 2); + } + } + + ScaleFrameFunc scale_fn_; +}; + +TEST_P(ScaleTest, ScaleFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); } + +TEST_P(ScaleTest, DISABLED_Speed) { + static const int kCountSpeedTestBlock = 100; + static const int kNumScaleFactorsToTest = 4; + static const int kScaleFactors[] = { 1, 2, 3, 4 }; + const int src_width = 1280; + const int src_height = 720; + for (INTERP_FILTER filter_type = 2; filter_type < 4; ++filter_type) { + for (int phase_scaler = 0; phase_scaler < 2; ++phase_scaler) { + for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; ++sf_up_idx) { + const int sf_up = kScaleFactors[sf_up_idx]; + for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest; + ++sf_down_idx) { + const int sf_down = kScaleFactors[sf_down_idx]; + const int dst_width = src_width * sf_up / sf_down; + const int dst_height = src_height * sf_up / sf_down; + if (sf_up == sf_down && sf_up != 1) { + continue; + } + // I420 frame width and height must be even. 
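+          // (I420 chroma is stored at half resolution in each direction, so
+          // odd luma dimensions would leave no whole-pixel chroma plane size.)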
+ if (dst_width & 1 || dst_height & 1) { + continue; + } + ASSERT_NO_FATAL_FAILURE( + ResetScaleImages(src_width, src_height, dst_width, dst_height)); + ASM_REGISTER_STATE_CHECK( + ReferenceScaleFrame(filter_type, phase_scaler)); + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + ScaleFrame(filter_type, phase_scaler); + } + libvpx_test::ClearSystemState(); + vpx_usec_timer_mark(&timer); + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + CompareImages(dst_img_); + DeallocScaleImages(); + + printf( + "filter_type = %d, phase_scaler = %d, src_width = %4d, " + "src_height = %4d, dst_width = %4d, dst_height = %4d, " + "scale factor = %d:%d, scale time: %5d ms\n", + filter_type, phase_scaler, src_width, src_height, dst_width, + dst_height, sf_down, sf_up, elapsed_time); + } + } + } + } +} + +INSTANTIATE_TEST_CASE_P(C, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_c)); + +#if HAVE_SSSE3 +INSTANTIATE_TEST_CASE_P(SSSE3, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_ssse3)); +#endif // HAVE_SSSE3 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_neon)); +#endif // HAVE_NEON + +} // namespace libvpx_test diff --git a/libs/libvpx/test/vp9_skip_loopfilter_test.cc b/libs/libvpx/test/vp9_skip_loopfilter_test.cc index e847bbddf3..d41a784a22 100644 --- a/libs/libvpx/test/vp9_skip_loopfilter_test.cc +++ b/libs/libvpx/test/vp9_skip_loopfilter_test.cc @@ -85,8 +85,8 @@ class SkipLoopFilterTest { // TODO(fgalligan): Move the MD5 testing code into another class. void OpenMd5File(const std::string &md5_file_name) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name); - ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: " - << md5_file_name; + ASSERT_TRUE(md5_file_ != NULL) + << "MD5 file open failed. Filename: " << md5_file_name; } // Reads the next line of the MD5 file. 
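+  # Each pass below writes to the same ${output_file} name, so the
+  # files_exist() checks in the callers effectively validate only the last
+  # thread count's output (see the TODO above).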
diff --git a/libs/libvpx/test/vp9_subtract_test.cc b/libs/libvpx/test/vp9_subtract_test.cc index 19ed304315..62845ad615 100644 --- a/libs/libvpx/test/vp9_subtract_test.cc +++ b/libs/libvpx/test/vp9_subtract_test.cc @@ -101,4 +101,9 @@ INSTANTIATE_TEST_CASE_P(MSA, VP9SubtractBlockTest, ::testing::Values(vpx_subtract_block_msa)); #endif +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P(MMI, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_mmi)); +#endif + } // namespace vp9 diff --git a/libs/libvpx/test/vp9_thread_test.cc b/libs/libvpx/test/vp9_thread_test.cc index 3e3fd25acb..576f5e906b 100644 --- a/libs/libvpx/test/vp9_thread_test.cc +++ b/libs/libvpx/test/vp9_thread_test.cc @@ -187,8 +187,8 @@ void DecodeFiles(const FileList files[]) { for (const FileList *iter = files; iter->name != NULL; ++iter) { SCOPED_TRACE(iter->name); for (int t = 1; t <= 8; ++t) { - EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t)) << "threads = " - << t; + EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t)) + << "threads = " << t; } } } diff --git a/libs/libvpx/test/vpx_scale_test.cc b/libs/libvpx/test/vpx_scale_test.cc index 81773fe5b6..ac75dceb23 100644 --- a/libs/libvpx/test/vpx_scale_test.cc +++ b/libs/libvpx/test/vpx_scale_test.cc @@ -14,149 +14,17 @@ #include "./vpx_scale_rtcd.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" +#include "test/vpx_scale_test.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" #include "vpx_scale/yv12config.h" -namespace { +namespace libvpx_test { typedef void (*ExtendFrameBorderFunc)(YV12_BUFFER_CONFIG *ybf); typedef void (*CopyFrameFunc)(const YV12_BUFFER_CONFIG *src_ybf, YV12_BUFFER_CONFIG *dst_ybf); -class VpxScaleBase { - public: - virtual ~VpxScaleBase() { libvpx_test::ClearSystemState(); } - - void ResetImage(int width, int height) { - width_ = width; - height_ = height; - memset(&img_, 0, sizeof(img_)); - ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&img_, width_, height_, - VP8BORDERINPIXELS)); - memset(img_.buffer_alloc, kBufFiller, img_.frame_size); - FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, - img_.y_stride); - FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, - img_.uv_stride); - FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, - img_.uv_stride); - - memset(&ref_img_, 0, sizeof(ref_img_)); - ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&ref_img_, width_, height_, - VP8BORDERINPIXELS)); - memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size); - - memset(&cpy_img_, 0, sizeof(cpy_img_)); - ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&cpy_img_, width_, height_, - VP8BORDERINPIXELS)); - memset(cpy_img_.buffer_alloc, kBufFiller, cpy_img_.frame_size); - ReferenceCopyFrame(); - } - - void DeallocImage() { - vp8_yv12_de_alloc_frame_buffer(&img_); - vp8_yv12_de_alloc_frame_buffer(&ref_img_); - vp8_yv12_de_alloc_frame_buffer(&cpy_img_); - } - - protected: - static const int kBufFiller = 123; - static const int kBufMax = kBufFiller - 1; - - static void FillPlane(uint8_t *buf, int width, int height, int stride) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - buf[x + (y * stride)] = (x + (width * y)) % kBufMax; - } - } - } - - static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height, - int width, int height, int stride, int padding) { - // Copy the outermost visible pixel to a distance of at least 'padding.' - // The buffers are allocated such that there may be excess space outside the - // padding. 
As long as the minimum amount of padding is achieved it is not - // necessary to fill this space as well. - uint8_t *left = buf - padding; - uint8_t *right = buf + crop_width; - const int right_extend = padding + (width - crop_width); - const int bottom_extend = padding + (height - crop_height); - - // Fill the border pixels from the nearest image pixel. - for (int y = 0; y < crop_height; ++y) { - memset(left, left[padding], padding); - memset(right, right[-1], right_extend); - left += stride; - right += stride; - } - - left = buf - padding; - uint8_t *top = left - (stride * padding); - // The buffer does not always extend as far as the stride. - // Equivalent to padding + width + padding. - const int extend_width = padding + crop_width + right_extend; - - // The first row was already extended to the left and right. Copy it up. - for (int y = 0; y < padding; ++y) { - memcpy(top, left, extend_width); - top += stride; - } - - uint8_t *bottom = left + (crop_height * stride); - for (int y = 0; y < bottom_extend; ++y) { - memcpy(bottom, left + (crop_height - 1) * stride, extend_width); - bottom += stride; - } - } - - void ReferenceExtendBorder() { - ExtendPlane(ref_img_.y_buffer, ref_img_.y_crop_width, - ref_img_.y_crop_height, ref_img_.y_width, ref_img_.y_height, - ref_img_.y_stride, ref_img_.border); - ExtendPlane(ref_img_.u_buffer, ref_img_.uv_crop_width, - ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, - ref_img_.uv_stride, ref_img_.border / 2); - ExtendPlane(ref_img_.v_buffer, ref_img_.uv_crop_width, - ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, - ref_img_.uv_stride, ref_img_.border / 2); - } - - void ReferenceCopyFrame() { - // Copy img_ to ref_img_ and extend frame borders. This will be used for - // verifying extend_fn_ as well as copy_frame_fn_. 
- EXPECT_EQ(ref_img_.frame_size, img_.frame_size); - for (int y = 0; y < img_.y_crop_height; ++y) { - for (int x = 0; x < img_.y_crop_width; ++x) { - ref_img_.y_buffer[x + y * ref_img_.y_stride] = - img_.y_buffer[x + y * img_.y_stride]; - } - } - - for (int y = 0; y < img_.uv_crop_height; ++y) { - for (int x = 0; x < img_.uv_crop_width; ++x) { - ref_img_.u_buffer[x + y * ref_img_.uv_stride] = - img_.u_buffer[x + y * img_.uv_stride]; - ref_img_.v_buffer[x + y * ref_img_.uv_stride] = - img_.v_buffer[x + y * img_.uv_stride]; - } - } - - ReferenceExtendBorder(); - } - - void CompareImages(const YV12_BUFFER_CONFIG actual) { - EXPECT_EQ(ref_img_.frame_size, actual.frame_size); - EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc, - ref_img_.frame_size)); - } - - YV12_BUFFER_CONFIG img_; - YV12_BUFFER_CONFIG ref_img_; - YV12_BUFFER_CONFIG cpy_img_; - int width_; - int height_; -}; - class ExtendBorderTest : public VpxScaleBase, public ::testing::TestWithParam { @@ -178,11 +46,11 @@ class ExtendBorderTest static const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 16383 }; for (int h = 0; h < kNumSizesToTest; ++h) { for (int w = 0; w < kNumSizesToTest; ++w) { - ResetImage(kSizesToTest[w], kSizesToTest[h]); + ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h])); + ReferenceCopyFrame(); ExtendBorder(); - ReferenceExtendBorder(); CompareImages(img_); - DeallocImage(); + DeallocImages(); } } } @@ -204,7 +72,7 @@ class CopyFrameTest : public VpxScaleBase, virtual void SetUp() { copy_frame_fn_ = GetParam(); } void CopyFrame() { - ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &cpy_img_)); + ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &dst_img_)); } void RunTest() { @@ -217,11 +85,11 @@ class CopyFrameTest : public VpxScaleBase, static const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 16383 }; for (int h = 0; h < kNumSizesToTest; ++h) { for (int w = 0; w < kNumSizesToTest; ++w) { - ResetImage(kSizesToTest[w], kSizesToTest[h]); + ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h])); ReferenceCopyFrame(); CopyFrame(); - CompareImages(cpy_img_); - DeallocImage(); + CompareImages(dst_img_); + DeallocImages(); } } } @@ -233,4 +101,5 @@ TEST_P(CopyFrameTest, CopyFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); } INSTANTIATE_TEST_CASE_P(C, CopyFrameTest, ::testing::Values(vp8_yv12_copy_frame_c)); -} // namespace + +} // namespace libvpx_test diff --git a/libs/libvpx/test/vpx_scale_test.h b/libs/libvpx/test/vpx_scale_test.h new file mode 100644 index 0000000000..dcbd02b91f --- /dev/null +++ b/libs/libvpx/test/vpx_scale_test.h @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef TEST_VPX_SCALE_TEST_H_ +#define TEST_VPX_SCALE_TEST_H_ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/yv12config.h" + +using libvpx_test::ACMRandom; + +namespace libvpx_test { + +class VpxScaleBase { + public: + virtual ~VpxScaleBase() { libvpx_test::ClearSystemState(); } + + void ResetImage(YV12_BUFFER_CONFIG *const img, const int width, + const int height) { + memset(img, 0, sizeof(*img)); + ASSERT_EQ( + 0, vp8_yv12_alloc_frame_buffer(img, width, height, VP8BORDERINPIXELS)); + memset(img->buffer_alloc, kBufFiller, img->frame_size); + } + + void ResetImages(const int width, const int height) { + ResetImage(&img_, width, height); + ResetImage(&ref_img_, width, height); + ResetImage(&dst_img_, width, height); + + FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, + img_.y_stride); + FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + } + + void ResetScaleImage(YV12_BUFFER_CONFIG *const img, const int width, + const int height) { + memset(img, 0, sizeof(*img)); +#if CONFIG_VP9_HIGHBITDEPTH + ASSERT_EQ(0, vpx_alloc_frame_buffer(img, width, height, 1, 1, 0, + VP9_ENC_BORDER_IN_PIXELS, 0)); +#else + ASSERT_EQ(0, vpx_alloc_frame_buffer(img, width, height, 1, 1, + VP9_ENC_BORDER_IN_PIXELS, 0)); +#endif + memset(img->buffer_alloc, kBufFiller, img->frame_size); + } + + void ResetScaleImages(const int src_width, const int src_height, + const int dst_width, const int dst_height) { + ResetScaleImage(&img_, src_width, src_height); + ResetScaleImage(&ref_img_, dst_width, dst_height); + ResetScaleImage(&dst_img_, dst_width, dst_height); + FillPlaneExtreme(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, + img_.y_stride); + FillPlaneExtreme(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + FillPlaneExtreme(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + } + + void DeallocImages() { + vp8_yv12_de_alloc_frame_buffer(&img_); + vp8_yv12_de_alloc_frame_buffer(&ref_img_); + vp8_yv12_de_alloc_frame_buffer(&dst_img_); + } + + void DeallocScaleImages() { + vpx_free_frame_buffer(&img_); + vpx_free_frame_buffer(&ref_img_); + vpx_free_frame_buffer(&dst_img_); + } + + protected: + static const int kBufFiller = 123; + static const int kBufMax = kBufFiller - 1; + + static void FillPlane(uint8_t *const buf, const int width, const int height, + const int stride) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + buf[x + (y * stride)] = (x + (width * y)) % kBufMax; + } + } + } + + static void FillPlaneExtreme(uint8_t *const buf, const int width, + const int height, const int stride) { + ACMRandom rnd; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + buf[x + (y * stride)] = rnd.Rand8() % 2 ? 255 : 0; + } + } + } + + static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height, + int width, int height, int stride, int padding) { + // Copy the outermost visible pixel to a distance of at least 'padding.' + // The buffers are allocated such that there may be excess space outside the + // padding. As long as the minimum amount of padding is achieved it is not + // necessary to fill this space as well. 
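+    // Rough per-row layout (widths not to scale):
+    //   [ padding | crop_width visible pixels | padding + (width - crop_width) ]
+    // 'left' below points at the first border byte and 'right' at the first
+    // byte past the visible pixels.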
+ uint8_t *left = buf - padding; + uint8_t *right = buf + crop_width; + const int right_extend = padding + (width - crop_width); + const int bottom_extend = padding + (height - crop_height); + + // Fill the border pixels from the nearest image pixel. + for (int y = 0; y < crop_height; ++y) { + memset(left, left[padding], padding); + memset(right, right[-1], right_extend); + left += stride; + right += stride; + } + + left = buf - padding; + uint8_t *top = left - (stride * padding); + // The buffer does not always extend as far as the stride. + // Equivalent to padding + width + padding. + const int extend_width = padding + crop_width + right_extend; + + // The first row was already extended to the left and right. Copy it up. + for (int y = 0; y < padding; ++y) { + memcpy(top, left, extend_width); + top += stride; + } + + uint8_t *bottom = left + (crop_height * stride); + for (int y = 0; y < bottom_extend; ++y) { + memcpy(bottom, left + (crop_height - 1) * stride, extend_width); + bottom += stride; + } + } + + void ReferenceExtendBorder() { + ExtendPlane(ref_img_.y_buffer, ref_img_.y_crop_width, + ref_img_.y_crop_height, ref_img_.y_width, ref_img_.y_height, + ref_img_.y_stride, ref_img_.border); + ExtendPlane(ref_img_.u_buffer, ref_img_.uv_crop_width, + ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, + ref_img_.uv_stride, ref_img_.border / 2); + ExtendPlane(ref_img_.v_buffer, ref_img_.uv_crop_width, + ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, + ref_img_.uv_stride, ref_img_.border / 2); + } + + void ReferenceCopyFrame() { + // Copy img_ to ref_img_ and extend frame borders. This will be used for + // verifying extend_fn_ as well as copy_frame_fn_. + EXPECT_EQ(ref_img_.frame_size, img_.frame_size); + for (int y = 0; y < img_.y_crop_height; ++y) { + for (int x = 0; x < img_.y_crop_width; ++x) { + ref_img_.y_buffer[x + y * ref_img_.y_stride] = + img_.y_buffer[x + y * img_.y_stride]; + } + } + + for (int y = 0; y < img_.uv_crop_height; ++y) { + for (int x = 0; x < img_.uv_crop_width; ++x) { + ref_img_.u_buffer[x + y * ref_img_.uv_stride] = + img_.u_buffer[x + y * img_.uv_stride]; + ref_img_.v_buffer[x + y * ref_img_.uv_stride] = + img_.v_buffer[x + y * img_.uv_stride]; + } + } + + ReferenceExtendBorder(); + } + + void CompareImages(const YV12_BUFFER_CONFIG actual) { + EXPECT_EQ(ref_img_.frame_size, actual.frame_size); + EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc, + ref_img_.frame_size)); + } + + YV12_BUFFER_CONFIG img_; + YV12_BUFFER_CONFIG ref_img_; + YV12_BUFFER_CONFIG dst_img_; +}; + +} // namespace libvpx_test + +#endif // TEST_VPX_SCALE_TEST_H_ diff --git a/libs/libvpx/test/vpx_temporal_svc_encoder.sh b/libs/libvpx/test/vpx_temporal_svc_encoder.sh index fcc8cb4ff4..56a7902f4f 100755 --- a/libs/libvpx/test/vpx_temporal_svc_encoder.sh +++ b/libs/libvpx/test/vpx_temporal_svc_encoder.sh @@ -40,6 +40,8 @@ vpx_tsvc_encoder() { local timebase_den="1000" local speed="6" local frame_drop_thresh="30" + local max_threads="4" + local error_resilient="1" shift 2 @@ -48,11 +50,22 @@ vpx_tsvc_encoder() { return 1 fi - eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" "${output_file}" \ - "${codec}" "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \ - "${timebase_num}" "${timebase_den}" "${speed}" "${frame_drop_thresh}" \ - "$@" \ - ${devnull} + # TODO(tomfinegan): Verify file output for all thread runs. 
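+# Callers capture the result, e.g. passes=$(vpxenc_passes_param), and forward
+# "${passes}" to vpxenc: realtime-only builds get --passes=1, all others
+# --passes=2.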
+ for threads in $(seq $max_threads); do + if [ "$(vpx_config_option_enabled CONFIG_VP9_HIGHBITDEPTH)" != "yes" ]; then + eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \ + "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \ + "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \ + "$@" ${devnull} + else + eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \ + "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \ + "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \ + "$@" "8" ${devnull} + fi + done } # Confirms that all expected output files exist given the output file name @@ -72,193 +85,217 @@ files_exist() { vpx_tsvc_encoder_vp8_mode_0() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 0 200 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_0" + vpx_tsvc_encoder vp8 "${output_basename}" 0 200 || return 1 # Mode 0 produces 1 stream - files_exist "${FUNCNAME}" 1 || return 1 + files_exist "${output_basename}" 1 || return 1 fi } vpx_tsvc_encoder_vp8_mode_1() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 1 200 400 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_1" + vpx_tsvc_encoder vp8 "${output_basename}" 1 200 400 || return 1 # Mode 1 produces 2 streams - files_exist "${FUNCNAME}" 2 || return 1 + files_exist "${output_basename}" 2 || return 1 fi } vpx_tsvc_encoder_vp8_mode_2() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 2 200 400 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_2" + vpx_tsvc_encoder vp8 "${output_basename}" 2 200 400 || return 1 # Mode 2 produces 2 streams - files_exist "${FUNCNAME}" 2 || return 1 + files_exist "${output_basename}" 2 || return 1 fi } vpx_tsvc_encoder_vp8_mode_3() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 3 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_3" + vpx_tsvc_encoder vp8 "${output_basename}" 3 200 400 600 || return 1 # Mode 3 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp8_mode_4() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 4 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_4" + vpx_tsvc_encoder vp8 "${output_basename}" 4 200 400 600 || return 1 # Mode 4 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp8_mode_5() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 5 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_5" + vpx_tsvc_encoder vp8 "${output_basename}" 5 200 400 600 || return 1 # Mode 5 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp8_mode_6() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 6 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_6" + vpx_tsvc_encoder vp8 "${output_basename}" 6 200 400 600 || return 1 # Mode 6 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || 
return 1 fi } vpx_tsvc_encoder_vp8_mode_7() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 7 200 400 600 800 1000 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_7" + vpx_tsvc_encoder vp8 "${output_basename}" 7 200 400 600 800 1000 || return 1 # Mode 7 produces 5 streams - files_exist "${FUNCNAME}" 5 || return 1 + files_exist "${output_basename}" 5 || return 1 fi } vpx_tsvc_encoder_vp8_mode_8() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 8 200 400 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_8" + vpx_tsvc_encoder vp8 "${output_basename}" 8 200 400 || return 1 # Mode 8 produces 2 streams - files_exist "${FUNCNAME}" 2 || return 1 + files_exist "${output_basename}" 2 || return 1 fi } vpx_tsvc_encoder_vp8_mode_9() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 9 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_9" + vpx_tsvc_encoder vp8 "${output_basename}" 9 200 400 600 || return 1 # Mode 9 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp8_mode_10() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 10 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_10" + vpx_tsvc_encoder vp8 "${output_basename}" 10 200 400 600 || return 1 # Mode 10 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp8_mode_11() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 11 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp8_mode_11" + vpx_tsvc_encoder vp8 "${output_basename}" 11 200 400 600 || return 1 # Mode 11 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_0() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 0 200 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_0" + vpx_tsvc_encoder vp9 "${output_basename}" 0 200 || return 1 # Mode 0 produces 1 stream - files_exist "${FUNCNAME}" 1 || return 1 + files_exist "${output_basename}" 1 || return 1 fi } vpx_tsvc_encoder_vp9_mode_1() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 1 200 400 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_1" + vpx_tsvc_encoder vp9 "${output_basename}" 1 200 400 || return 1 # Mode 1 produces 2 streams - files_exist "${FUNCNAME}" 2 || return 1 + files_exist "${output_basename}" 2 || return 1 fi } vpx_tsvc_encoder_vp9_mode_2() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 2 200 400 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_2" + vpx_tsvc_encoder vp9 "${output_basename}" 2 200 400 || return 1 # Mode 2 produces 2 streams - files_exist "${FUNCNAME}" 2 || return 1 + files_exist "${output_basename}" 2 || return 1 fi } vpx_tsvc_encoder_vp9_mode_3() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 3 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_3" + vpx_tsvc_encoder vp9 "${output_basename}" 3 200 400 600 || return 1 # Mode 3 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist 
"${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_4() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 4 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_4" + vpx_tsvc_encoder vp9 "${output_basename}" 4 200 400 600 || return 1 # Mode 4 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_5() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 5 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_5" + vpx_tsvc_encoder vp9 "${output_basename}" 5 200 400 600 || return 1 # Mode 5 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_6() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 6 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_6" + vpx_tsvc_encoder vp9 "${output_basename}" 6 200 400 600 || return 1 # Mode 6 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_7() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 7 200 400 600 800 1000 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_7" + vpx_tsvc_encoder vp9 "${output_basename}" 7 200 400 600 800 1000 || return 1 # Mode 7 produces 5 streams - files_exist "${FUNCNAME}" 5 || return 1 + files_exist "${output_basename}" 5 || return 1 fi } vpx_tsvc_encoder_vp9_mode_8() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 8 200 400 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_8" + vpx_tsvc_encoder vp9 "${output_basename}" 8 200 400 || return 1 # Mode 8 produces 2 streams - files_exist "${FUNCNAME}" 2 || return 1 + files_exist "${output_basename}" 2 || return 1 fi } vpx_tsvc_encoder_vp9_mode_9() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 9 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_9" + vpx_tsvc_encoder vp9 "${output_basename}" 9 200 400 600 || return 1 # Mode 9 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_10() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 10 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_10" + vpx_tsvc_encoder vp9 "${output_basename}" 10 200 400 600 || return 1 # Mode 10 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_11() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 11 200 400 600 || return 1 + local readonly output_basename="vpx_tsvc_encoder_vp9_mode_11" + vpx_tsvc_encoder vp9 "${output_basename}" 11 200 400 600 || return 1 # Mode 11 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } diff --git a/libs/libvpx/test/vpxenc.sh b/libs/libvpx/test/vpxenc.sh index e8994992ae..0c160dafc0 100755 --- a/libs/libvpx/test/vpxenc.sh +++ b/libs/libvpx/test/vpxenc.sh @@ -90,6 +90,15 @@ vpxenc_rt_params() { --undershoot-pct=50" } +# Forces --passes to 1 with CONFIG_REALTIME_ONLY. 
+vpxenc_passes_param() { + if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" = "yes" ]; then + echo "--passes=1" + else + echo "--passes=2" + fi +} + # Wrapper function for running vpxenc with pipe input. Requires that # LIBVPX_BIN_PATH points to the directory containing vpxenc. $1 is used as the # input file path and shifted away. All remaining parameters are passed through @@ -218,9 +227,11 @@ vpxenc_vp8_ivf_piped_input() { vpxenc_vp9_ivf() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ + "${passes}" \ --ivf \ --output="${output}" @@ -235,9 +246,11 @@ vpxenc_vp9_webm() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ + "${passes}" \ --output="${output}" if [ ! -e "${output}" ]; then @@ -339,11 +352,13 @@ vpxenc_vp9_webm_2pass() { vpxenc_vp9_ivf_lossless() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ --ivf \ --output="${output}" \ + "${passes}" \ --lossless=1 if [ ! -e "${output}" ]; then @@ -356,11 +371,13 @@ vpxenc_vp9_ivf_lossless() { vpxenc_vp9_ivf_minq0_maxq0() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ --ivf \ --output="${output}" \ + "${passes}" \ --min-q=0 \ --max-q=0 @@ -377,12 +394,13 @@ vpxenc_vp9_webm_lag10_frames20() { local readonly lag_total_frames=20 local readonly lag_frames=10 local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${lag_total_frames}" \ --lag-in-frames="${lag_frames}" \ --output="${output}" \ - --passes=2 \ + "${passes}" \ --auto-alt-ref=1 if [ ! -e "${output}" ]; then @@ -397,9 +415,11 @@ vpxenc_vp9_webm_non_square_par() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_non_square_par.webm" + local readonly passes=$(vpxenc_passes_param) vpxenc $(y4m_input_non_square_par) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ + "${passes}" \ --output="${output}" if [ ! 
-e "${output}" ]; then @@ -412,18 +432,21 @@ vpxenc_vp9_webm_non_square_par() { vpxenc_tests="vpxenc_vp8_ivf vpxenc_vp8_webm vpxenc_vp8_webm_rt - vpxenc_vp8_webm_2pass - vpxenc_vp8_webm_lag10_frames20 vpxenc_vp8_ivf_piped_input vpxenc_vp9_ivf vpxenc_vp9_webm vpxenc_vp9_webm_rt vpxenc_vp9_webm_rt_multithread_tiled vpxenc_vp9_webm_rt_multithread_tiled_frameparallel - vpxenc_vp9_webm_2pass vpxenc_vp9_ivf_lossless vpxenc_vp9_ivf_minq0_maxq0 vpxenc_vp9_webm_lag10_frames20 vpxenc_vp9_webm_non_square_par" +if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then + vpxenc_tests="$vpxenc_tests + vpxenc_vp8_webm_2pass + vpxenc_vp8_webm_lag10_frames20 + vpxenc_vp9_webm_2pass" +fi run_tests vpxenc_verify_environment "${vpxenc_tests}" diff --git a/libs/libvpx/test/webm_video_source.h b/libs/libvpx/test/webm_video_source.h index 53713618ee..09c007a3f3 100644 --- a/libs/libvpx/test/webm_video_source.h +++ b/libs/libvpx/test/webm_video_source.h @@ -40,8 +40,8 @@ class WebMVideoSource : public CompressedVideoSource { virtual void Begin() { vpx_ctx_->file = OpenTestDataFile(file_name_); - ASSERT_TRUE(vpx_ctx_->file != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_TRUE(vpx_ctx_->file != NULL) + << "Input file open failed. Filename: " << file_name_; ASSERT_EQ(file_is_webm(webm_ctx_, vpx_ctx_), 1) << "file is not WebM"; diff --git a/libs/libvpx/test/y4m_video_source.h b/libs/libvpx/test/y4m_video_source.h index 2682ddde3d..1301f69703 100644 --- a/libs/libvpx/test/y4m_video_source.h +++ b/libs/libvpx/test/y4m_video_source.h @@ -34,8 +34,8 @@ class Y4mVideoSource : public VideoSource { virtual void OpenSource() { CloseSource(); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_TRUE(input_file_ != NULL) + << "Input file open failed. Filename: " << file_name_; } virtual void ReadSourceToStart() { diff --git a/libs/libvpx/test/yuv_video_source.h b/libs/libvpx/test/yuv_video_source.h index 71ad2ab9a6..aee6b2ffbb 100644 --- a/libs/libvpx/test/yuv_video_source.h +++ b/libs/libvpx/test/yuv_video_source.h @@ -43,8 +43,8 @@ class YUVVideoSource : public VideoSource { virtual void Begin() { if (input_file_) fclose(input_file_); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_TRUE(input_file_ != NULL) + << "Input file open failed. Filename: " << file_name_; if (start_) { fseek(input_file_, static_cast(raw_size_) * start_, SEEK_SET); } diff --git a/libs/libvpx/third_party/googletest/README.libvpx b/libs/libvpx/third_party/googletest/README.libvpx index 0e3b8b9377..2cd6910b41 100644 --- a/libs/libvpx/third_party/googletest/README.libvpx +++ b/libs/libvpx/third_party/googletest/README.libvpx @@ -1,7 +1,7 @@ -URL: http://code.google.com/p/googletest/ -Version: 1.7.0 +URL: https://github.com/google/googletest +Version: 1.8.0 License: BSD -License File: COPYING +License File: LICENSE Description: Google's framework for writing C++ tests on a variety of platforms @@ -12,10 +12,13 @@ failures, various options for running the tests, and XML test report generation. Local Modifications: -- Removed unused declarations of kPathSeparatorString to have warning - free build. -- Added GTEST_ATTRIBUTE_UNUSED_ to test registering dummies in TEST_P - and INSTANTIATE_TEST_CASE_P to remove warnings about unused variables - under GCC 5. 
-- Only define g_in_fast_death_test_child for non-Windows builds; quiets an - unused variable warning. +- Remove everything but: + googletest-release-1.8.0/googletest/ + CHANGES + CONTRIBUTORS + include + LICENSE + README.md + src +- Suppress unsigned overflow instrumentation in the LCG + https://github.com/google/googletest/pull/1066 diff --git a/libs/libvpx/third_party/googletest/src/README b/libs/libvpx/third_party/googletest/src/README deleted file mode 100644 index 26f35a8479..0000000000 --- a/libs/libvpx/third_party/googletest/src/README +++ /dev/null @@ -1,435 +0,0 @@ -Google C++ Testing Framework -============================ - -http://code.google.com/p/googletest/ - -Overview --------- - -Google's framework for writing C++ tests on a variety of platforms -(Linux, Mac OS X, Windows, Windows CE, Symbian, etc). Based on the -xUnit architecture. Supports automatic test discovery, a rich set of -assertions, user-defined assertions, death tests, fatal and non-fatal -failures, various options for running the tests, and XML test report -generation. - -Please see the project page above for more information as well as the -mailing list for questions, discussions, and development. There is -also an IRC channel on OFTC (irc.oftc.net) #gtest available. Please -join us! - -Requirements for End Users --------------------------- - -Google Test is designed to have fairly minimal requirements to build -and use with your projects, but there are some. Currently, we support -Linux, Windows, Mac OS X, and Cygwin. We will also make our best -effort to support other platforms (e.g. Solaris, AIX, and z/OS). -However, since core members of the Google Test project have no access -to these platforms, Google Test may have outstanding issues there. If -you notice any problems on your platform, please notify -googletestframework@googlegroups.com. Patches for fixing them are -even more welcome! - -### Linux Requirements ### - -These are the base requirements to build and use Google Test from a source -package (as described below): - * GNU-compatible Make or gmake - * POSIX-standard shell - * POSIX(-2) Regular Expressions (regex.h) - * A C++98-standard-compliant compiler - -### Windows Requirements ### - - * Microsoft Visual C++ 7.1 or newer - -### Cygwin Requirements ### - - * Cygwin 1.5.25-14 or newer - -### Mac OS X Requirements ### - - * Mac OS X 10.4 Tiger or newer - * Developer Tools Installed - -Also, you'll need CMake 2.6.4 or higher if you want to build the -samples using the provided CMake script, regardless of the platform. - -Requirements for Contributors ------------------------------ - -We welcome patches. If you plan to contribute a patch, you need to -build Google Test and its own tests from an SVN checkout (described -below), which has further requirements: - - * Python version 2.3 or newer (for running some of the tests and - re-generating certain source files from templates) - * CMake 2.6.4 or newer - -Getting the Source ------------------- - -There are two primary ways of getting Google Test's source code: you -can download a stable source release in your preferred archive format, -or directly check out the source from our Subversion (SVN) repositary. -The SVN checkout requires a few extra steps and some extra software -packages on your system, but lets you track the latest development and -make patches much more easily, so we highly encourage it. - -### Source Package ### - -Google Test is released in versioned source packages which can be -downloaded from the download page [1]. 
Several different archive -formats are provided, but the only difference is the tools used to -manipulate them, and the size of the resulting file. Download -whichever you are most comfortable with. - - [1] http://code.google.com/p/googletest/downloads/list - -Once the package is downloaded, expand it using whichever tools you -prefer for that type. This will result in a new directory with the -name "gtest-X.Y.Z" which contains all of the source code. Here are -some examples on Linux: - - tar -xvzf gtest-X.Y.Z.tar.gz - tar -xvjf gtest-X.Y.Z.tar.bz2 - unzip gtest-X.Y.Z.zip - -### SVN Checkout ### - -To check out the main branch (also known as the "trunk") of Google -Test, run the following Subversion command: - - svn checkout http://googletest.googlecode.com/svn/trunk/ gtest-svn - -Setting up the Build --------------------- - -To build Google Test and your tests that use it, you need to tell your -build system where to find its headers and source files. The exact -way to do it depends on which build system you use, and is usually -straightforward. - -### Generic Build Instructions ### - -Suppose you put Google Test in directory ${GTEST_DIR}. To build it, -create a library build target (or a project as called by Visual Studio -and Xcode) to compile - - ${GTEST_DIR}/src/gtest-all.cc - -with ${GTEST_DIR}/include in the system header search path and ${GTEST_DIR} -in the normal header search path. Assuming a Linux-like system and gcc, -something like the following will do: - - g++ -isystem ${GTEST_DIR}/include -I${GTEST_DIR} \ - -pthread -c ${GTEST_DIR}/src/gtest-all.cc - ar -rv libgtest.a gtest-all.o - -(We need -pthread as Google Test uses threads.) - -Next, you should compile your test source file with -${GTEST_DIR}/include in the system header search path, and link it -with gtest and any other necessary libraries: - - g++ -isystem ${GTEST_DIR}/include -pthread path/to/your_test.cc libgtest.a \ - -o your_test - -As an example, the make/ directory contains a Makefile that you can -use to build Google Test on systems where GNU make is available -(e.g. Linux, Mac OS X, and Cygwin). It doesn't try to build Google -Test's own tests. Instead, it just builds the Google Test library and -a sample test. You can use it as a starting point for your own build -script. - -If the default settings are correct for your environment, the -following commands should succeed: - - cd ${GTEST_DIR}/make - make - ./sample1_unittest - -If you see errors, try to tweak the contents of make/Makefile to make -them go away. There are instructions in make/Makefile on how to do -it. - -### Using CMake ### - -Google Test comes with a CMake build script (CMakeLists.txt) that can -be used on a wide range of platforms ("C" stands for cross-platofrm.). -If you don't have CMake installed already, you can download it for -free from http://www.cmake.org/. - -CMake works by generating native makefiles or build projects that can -be used in the compiler environment of your choice. The typical -workflow starts with: - - mkdir mybuild # Create a directory to hold the build output. - cd mybuild - cmake ${GTEST_DIR} # Generate native build scripts. - -If you want to build Google Test's samples, you should replace the -last command with - - cmake -Dgtest_build_samples=ON ${GTEST_DIR} - -If you are on a *nix system, you should now see a Makefile in the -current directory. Just type 'make' to build gtest. - -If you use Windows and have Vistual Studio installed, a gtest.sln file -and several .vcproj files will be created. 
You can then build them -using Visual Studio. - -On Mac OS X with Xcode installed, a .xcodeproj file will be generated. - -### Legacy Build Scripts ### - -Before settling on CMake, we have been providing hand-maintained build -projects/scripts for Visual Studio, Xcode, and Autotools. While we -continue to provide them for convenience, they are not actively -maintained any more. We highly recommend that you follow the -instructions in the previous two sections to integrate Google Test -with your existing build system. - -If you still need to use the legacy build scripts, here's how: - -The msvc\ folder contains two solutions with Visual C++ projects. -Open the gtest.sln or gtest-md.sln file using Visual Studio, and you -are ready to build Google Test the same way you build any Visual -Studio project. Files that have names ending with -md use DLL -versions of Microsoft runtime libraries (the /MD or the /MDd compiler -option). Files without that suffix use static versions of the runtime -libraries (the /MT or the /MTd option). Please note that one must use -the same option to compile both gtest and the test code. If you use -Visual Studio 2005 or above, we recommend the -md version as /MD is -the default for new projects in these versions of Visual Studio. - -On Mac OS X, open the gtest.xcodeproj in the xcode/ folder using -Xcode. Build the "gtest" target. The universal binary framework will -end up in your selected build directory (selected in the Xcode -"Preferences..." -> "Building" pane and defaults to xcode/build). -Alternatively, at the command line, enter: - - xcodebuild - -This will build the "Release" configuration of gtest.framework in your -default build location. See the "xcodebuild" man page for more -information about building different configurations and building in -different locations. - -If you wish to use the Google Test Xcode project with Xcode 4.x and -above, you need to either: - * update the SDK configuration options in xcode/Config/General.xconfig. - Comment options SDKROOT, MACOS_DEPLOYMENT_TARGET, and GCC_VERSION. If - you choose this route you lose the ability to target earlier versions - of MacOS X. - * Install an SDK for an earlier version. This doesn't appear to be - supported by Apple, but has been reported to work - (http://stackoverflow.com/questions/5378518). - -Tweaking Google Test --------------------- - -Google Test can be used in diverse environments. The default -configuration may not work (or may not work well) out of the box in -some environments. However, you can easily tweak Google Test by -defining control macros on the compiler command line. Generally, -these macros are named like GTEST_XYZ and you define them to either 1 -or 0 to enable or disable a certain feature. - -We list the most frequently used macros below. For a complete list, -see file include/gtest/internal/gtest-port.h. - -### Choosing a TR1 Tuple Library ### - -Some Google Test features require the C++ Technical Report 1 (TR1) -tuple library, which is not yet available with all compilers. The -good news is that Google Test implements a subset of TR1 tuple that's -enough for its own need, and will automatically use this when the -compiler doesn't provide TR1 tuple. - -Usually you don't need to care about which tuple library Google Test -uses. However, if your project already uses TR1 tuple, you need to -tell Google Test to use the same TR1 tuple library the rest of your -project uses, or the two tuple implementations will clash. 
To do -that, add - - -DGTEST_USE_OWN_TR1_TUPLE=0 - -to the compiler flags while compiling Google Test and your tests. If -you want to force Google Test to use its own tuple library, just add - - -DGTEST_USE_OWN_TR1_TUPLE=1 - -to the compiler flags instead. - -If you don't want Google Test to use tuple at all, add - - -DGTEST_HAS_TR1_TUPLE=0 - -and all features using tuple will be disabled. - -### Multi-threaded Tests ### - -Google Test is thread-safe where the pthread library is available. -After #include "gtest/gtest.h", you can check the GTEST_IS_THREADSAFE -macro to see whether this is the case (yes if the macro is #defined to -1, no if it's undefined.). - -If Google Test doesn't correctly detect whether pthread is available -in your environment, you can force it with - - -DGTEST_HAS_PTHREAD=1 - -or - - -DGTEST_HAS_PTHREAD=0 - -When Google Test uses pthread, you may need to add flags to your -compiler and/or linker to select the pthread library, or you'll get -link errors. If you use the CMake script or the deprecated Autotools -script, this is taken care of for you. If you use your own build -script, you'll need to read your compiler and linker's manual to -figure out what flags to add. - -### As a Shared Library (DLL) ### - -Google Test is compact, so most users can build and link it as a -static library for the simplicity. You can choose to use Google Test -as a shared library (known as a DLL on Windows) if you prefer. - -To compile *gtest* as a shared library, add - - -DGTEST_CREATE_SHARED_LIBRARY=1 - -to the compiler flags. You'll also need to tell the linker to produce -a shared library instead - consult your linker's manual for how to do -it. - -To compile your *tests* that use the gtest shared library, add - - -DGTEST_LINKED_AS_SHARED_LIBRARY=1 - -to the compiler flags. - -Note: while the above steps aren't technically necessary today when -using some compilers (e.g. GCC), they may become necessary in the -future, if we decide to improve the speed of loading the library (see -http://gcc.gnu.org/wiki/Visibility for details). Therefore you are -recommended to always add the above flags when using Google Test as a -shared library. Otherwise a future release of Google Test may break -your build script. - -### Avoiding Macro Name Clashes ### - -In C++, macros don't obey namespaces. Therefore two libraries that -both define a macro of the same name will clash if you #include both -definitions. In case a Google Test macro clashes with another -library, you can force Google Test to rename its macro to avoid the -conflict. - -Specifically, if both Google Test and some other code define macro -FOO, you can add - - -DGTEST_DONT_DEFINE_FOO=1 - -to the compiler flags to tell Google Test to change the macro's name -from FOO to GTEST_FOO. Currently FOO can be FAIL, SUCCEED, or TEST. -For example, with -DGTEST_DONT_DEFINE_TEST=1, you'll need to write - - GTEST_TEST(SomeTest, DoesThis) { ... } - -instead of - - TEST(SomeTest, DoesThis) { ... } - -in order to define a test. - -Upgrating from an Earlier Version ---------------------------------- - -We strive to keep Google Test releases backward compatible. -Sometimes, though, we have to make some breaking changes for the -users' long-term benefits. This section describes what you'll need to -do if you are upgrading from an earlier version of Google Test. - -### Upgrading from 1.3.0 or Earlier ### - -You may need to explicitly enable or disable Google Test's own TR1 -tuple library. See the instructions in section "Choosing a TR1 Tuple -Library". 
- -### Upgrading from 1.4.0 or Earlier ### - -The Autotools build script (configure + make) is no longer officially -supportted. You are encouraged to migrate to your own build system or -use CMake. If you still need to use Autotools, you can find -instructions in the README file from Google Test 1.4.0. - -On platforms where the pthread library is available, Google Test uses -it in order to be thread-safe. See the "Multi-threaded Tests" section -for what this means to your build script. - -If you use Microsoft Visual C++ 7.1 with exceptions disabled, Google -Test will no longer compile. This should affect very few people, as a -large portion of STL (including ) doesn't compile in this mode -anyway. We decided to stop supporting it in order to greatly simplify -Google Test's implementation. - -Developing Google Test ----------------------- - -This section discusses how to make your own changes to Google Test. - -### Testing Google Test Itself ### - -To make sure your changes work as intended and don't break existing -functionality, you'll want to compile and run Google Test's own tests. -For that you can use CMake: - - mkdir mybuild - cd mybuild - cmake -Dgtest_build_tests=ON ${GTEST_DIR} - -Make sure you have Python installed, as some of Google Test's tests -are written in Python. If the cmake command complains about not being -able to find Python ("Could NOT find PythonInterp (missing: -PYTHON_EXECUTABLE)"), try telling it explicitly where your Python -executable can be found: - - cmake -DPYTHON_EXECUTABLE=path/to/python -Dgtest_build_tests=ON ${GTEST_DIR} - -Next, you can build Google Test and all of its own tests. On *nix, -this is usually done by 'make'. To run the tests, do - - make test - -All tests should pass. - -### Regenerating Source Files ### - -Some of Google Test's source files are generated from templates (not -in the C++ sense) using a script. A template file is named FOO.pump, -where FOO is the name of the file it will generate. For example, the -file include/gtest/internal/gtest-type-util.h.pump is used to generate -gtest-type-util.h in the same directory. - -Normally you don't need to worry about regenerating the source files, -unless you need to modify them. In that case, you should modify the -corresponding .pump files instead and run the pump.py Python script to -regenerate them. You can find pump.py in the scripts/ directory. -Read the Pump manual [2] for how to use it. - - [2] http://code.google.com/p/googletest/wiki/PumpManual - -### Contributing a Patch ### - -We welcome patches. Please read the Google Test developer's guide [3] -for how you can contribute. In particular, make sure you have signed -the Contributor License Agreement, or we won't be able to accept the -patch. - - [3] http://code.google.com/p/googletest/wiki/GoogleTestDevGuide - -Happy testing! diff --git a/libs/libvpx/third_party/googletest/src/README.md b/libs/libvpx/third_party/googletest/src/README.md new file mode 100644 index 0000000000..edd4408054 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/README.md @@ -0,0 +1,280 @@ + +### Generic Build Instructions ### + +#### Setup #### + +To build Google Test and your tests that use it, you need to tell your +build system where to find its headers and source files. The exact +way to do it depends on which build system you use, and is usually +straightforward. + +#### Build #### + +Suppose you put Google Test in directory `${GTEST_DIR}`. 
To build it, +create a library build target (or a project as called by Visual Studio +and Xcode) to compile + + ${GTEST_DIR}/src/gtest-all.cc + +with `${GTEST_DIR}/include` in the system header search path and `${GTEST_DIR}` +in the normal header search path. Assuming a Linux-like system and gcc, +something like the following will do: + + g++ -isystem ${GTEST_DIR}/include -I${GTEST_DIR} \ + -pthread -c ${GTEST_DIR}/src/gtest-all.cc + ar -rv libgtest.a gtest-all.o + +(We need `-pthread` as Google Test uses threads.) + +Next, you should compile your test source file with +`${GTEST_DIR}/include` in the system header search path, and link it +with gtest and any other necessary libraries: + + g++ -isystem ${GTEST_DIR}/include -pthread path/to/your_test.cc libgtest.a \ + -o your_test + +As an example, the make/ directory contains a Makefile that you can +use to build Google Test on systems where GNU make is available +(e.g. Linux, Mac OS X, and Cygwin). It doesn't try to build Google +Test's own tests. Instead, it just builds the Google Test library and +a sample test. You can use it as a starting point for your own build +script. + +If the default settings are correct for your environment, the +following commands should succeed: + + cd ${GTEST_DIR}/make + make + ./sample1_unittest + +If you see errors, try to tweak the contents of `make/Makefile` to make +them go away. There are instructions in `make/Makefile` on how to do +it. + +### Using CMake ### + +Google Test comes with a CMake build script ( +[CMakeLists.txt](CMakeLists.txt)) that can be used on a wide range of platforms ("C" stands for +cross-platform.). If you don't have CMake installed already, you can +download it for free from <http://www.cmake.org/>. + +CMake works by generating native makefiles or build projects that can +be used in the compiler environment of your choice. The typical +workflow starts with: + + mkdir mybuild # Create a directory to hold the build output. + cd mybuild + cmake ${GTEST_DIR} # Generate native build scripts. + +If you want to build Google Test's samples, you should replace the +last command with + + cmake -Dgtest_build_samples=ON ${GTEST_DIR} + +If you are on a \*nix system, you should now see a Makefile in the +current directory. Just type 'make' to build gtest. + +If you use Windows and have Visual Studio installed, a `gtest.sln` file +and several `.vcproj` files will be created. You can then build them +using Visual Studio. + +On Mac OS X with Xcode installed, a `.xcodeproj` file will be generated. + +### Legacy Build Scripts ### + +Before settling on CMake, we have been providing hand-maintained build +projects/scripts for Visual Studio, Xcode, and Autotools. While we +continue to provide them for convenience, they are not actively +maintained any more. We highly recommend that you follow the +instructions in the previous two sections to integrate Google Test +with your existing build system. + +If you still need to use the legacy build scripts, here's how: + +The msvc\ folder contains two solutions with Visual C++ projects. +Open the `gtest.sln` or `gtest-md.sln` file using Visual Studio, and you +are ready to build Google Test the same way you build any Visual +Studio project. Files that have names ending with -md use DLL +versions of Microsoft runtime libraries (the /MD or the /MDd compiler +option). Files without that suffix use static versions of the runtime +libraries (the /MT or the /MTd option). Please note that one must use +the same option to compile both gtest and the test code.
If you use +Visual Studio 2005 or above, we recommend the -md version as /MD is +the default for new projects in these versions of Visual Studio. + +On Mac OS X, open the `gtest.xcodeproj` in the `xcode/` folder using +Xcode. Build the "gtest" target. The universal binary framework will +end up in your selected build directory (selected in the Xcode +"Preferences..." -> "Building" pane and defaults to xcode/build). +Alternatively, at the command line, enter: + + xcodebuild + +This will build the "Release" configuration of gtest.framework in your +default build location. See the "xcodebuild" man page for more +information about building different configurations and building in +different locations. + +If you wish to use the Google Test Xcode project with Xcode 4.x and +above, you need to either: + + * update the SDK configuration options in xcode/Config/General.xconfig. + Comment options `SDKROOT`, `MACOS_DEPLOYMENT_TARGET`, and `GCC_VERSION`. If + you choose this route you lose the ability to target earlier versions + of MacOS X. + * Install an SDK for an earlier version. This doesn't appear to be + supported by Apple, but has been reported to work + (http://stackoverflow.com/questions/5378518). + +### Tweaking Google Test ### + +Google Test can be used in diverse environments. The default +configuration may not work (or may not work well) out of the box in +some environments. However, you can easily tweak Google Test by +defining control macros on the compiler command line. Generally, +these macros are named like `GTEST_XYZ` and you define them to either 1 +or 0 to enable or disable a certain feature. + +We list the most frequently used macros below. For a complete list, +see file [include/gtest/internal/gtest-port.h](include/gtest/internal/gtest-port.h). + +### Choosing a TR1 Tuple Library ### + +Some Google Test features require the C++ Technical Report 1 (TR1) +tuple library, which is not yet available with all compilers. The +good news is that Google Test implements a subset of TR1 tuple that's +enough for its own need, and will automatically use this when the +compiler doesn't provide TR1 tuple. + +Usually you don't need to care about which tuple library Google Test +uses. However, if your project already uses TR1 tuple, you need to +tell Google Test to use the same TR1 tuple library the rest of your +project uses, or the two tuple implementations will clash. To do +that, add + + -DGTEST_USE_OWN_TR1_TUPLE=0 + +to the compiler flags while compiling Google Test and your tests. If +you want to force Google Test to use its own tuple library, just add + + -DGTEST_USE_OWN_TR1_TUPLE=1 + +to the compiler flags instead. + +If you don't want Google Test to use tuple at all, add + + -DGTEST_HAS_TR1_TUPLE=0 + +and all features using tuple will be disabled. + +### Multi-threaded Tests ### + +Google Test is thread-safe where the pthread library is available. +After `#include "gtest/gtest.h"`, you can check the `GTEST_IS_THREADSAFE` +macro to see whether this is the case (yes if the macro is `#defined` to +1, no if it's undefined.). + +If Google Test doesn't correctly detect whether pthread is available +in your environment, you can force it with + + -DGTEST_HAS_PTHREAD=1 + +or + + -DGTEST_HAS_PTHREAD=0 + +When Google Test uses pthread, you may need to add flags to your +compiler and/or linker to select the pthread library, or you'll get +link errors. If you use the CMake script or the deprecated Autotools +script, this is taken care of for you. 
If you use your own build +script, you'll need to read your compiler and linker's manual to +figure out what flags to add. + +### As a Shared Library (DLL) ### + +Google Test is compact, so most users can build and link it as a +static library for the simplicity. You can choose to use Google Test +as a shared library (known as a DLL on Windows) if you prefer. + +To compile *gtest* as a shared library, add + + -DGTEST_CREATE_SHARED_LIBRARY=1 + +to the compiler flags. You'll also need to tell the linker to produce +a shared library instead - consult your linker's manual for how to do +it. + +To compile your *tests* that use the gtest shared library, add + + -DGTEST_LINKED_AS_SHARED_LIBRARY=1 + +to the compiler flags. + +Note: while the above steps aren't technically necessary today when +using some compilers (e.g. GCC), they may become necessary in the +future, if we decide to improve the speed of loading the library (see + for details). Therefore you are +recommended to always add the above flags when using Google Test as a +shared library. Otherwise a future release of Google Test may break +your build script. + +### Avoiding Macro Name Clashes ### + +In C++, macros don't obey namespaces. Therefore two libraries that +both define a macro of the same name will clash if you `#include` both +definitions. In case a Google Test macro clashes with another +library, you can force Google Test to rename its macro to avoid the +conflict. + +Specifically, if both Google Test and some other code define macro +FOO, you can add + + -DGTEST_DONT_DEFINE_FOO=1 + +to the compiler flags to tell Google Test to change the macro's name +from `FOO` to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, +or `TEST`. For example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll +need to write + + GTEST_TEST(SomeTest, DoesThis) { ... } + +instead of + + TEST(SomeTest, DoesThis) { ... } + +in order to define a test. + +## Developing Google Test ## + +This section discusses how to make your own changes to Google Test. + +### Testing Google Test Itself ### + +To make sure your changes work as intended and don't break existing +functionality, you'll want to compile and run Google Test's own tests. +For that you can use CMake: + + mkdir mybuild + cd mybuild + cmake -Dgtest_build_tests=ON ${GTEST_DIR} + +Make sure you have Python installed, as some of Google Test's tests +are written in Python. If the cmake command complains about not being +able to find Python (`Could NOT find PythonInterp (missing: +PYTHON_EXECUTABLE)`), try telling it explicitly where your Python +executable can be found: + + cmake -DPYTHON_EXECUTABLE=path/to/python -Dgtest_build_tests=ON ${GTEST_DIR} + +Next, you can build Google Test and all of its own tests. On \*nix, +this is usually done by 'make'. To run the tests, do + + make test + +All tests should pass. + +Normally you don't need to worry about regenerating the source files, +unless you need to modify them. In that case, you should modify the +corresponding .pump files instead and run the pump.py Python script to +regenerate them. You can find pump.py in the [scripts/](scripts/) directory. +Read the [Pump manual](docs/PumpManual.md) for how to use it. diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h new file mode 100644 index 0000000000..957a69c6a9 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h @@ -0,0 +1,294 @@ +// Copyright 2005, Google Inc. 
+// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file defines the public API for death tests. It is +// #included by gtest.h so a user doesn't need to include this +// directly. + +#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ + +#include "gtest/internal/gtest-death-test-internal.h" + +namespace testing { + +// This flag controls the style of death tests. Valid values are "threadsafe", +// meaning that the death test child process will re-execute the test binary +// from the start, running only a single death test, or "fast", +// meaning that the child process will execute the test logic immediately +// after forking. +GTEST_DECLARE_string_(death_test_style); + +#if GTEST_HAS_DEATH_TEST + +namespace internal { + +// Returns a Boolean value indicating whether the caller is currently +// executing in the context of the death test child process. Tools such as +// Valgrind heap checkers may need this to modify their behavior in death +// tests. IMPORTANT: This is an internal utility. Using it may break the +// implementation of death tests. User code MUST NOT use it. +GTEST_API_ bool InDeathTestChild(); + +} // namespace internal + +// The following macros are useful for writing death tests. + +// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is +// executed: +// +// 1. It generates a warning if there is more than one active +// thread. This is because it's safe to fork() or clone() only +// when there is a single thread. +// +// 2. The parent process clone()s a sub-process and runs the death +// test in it; the sub-process exits with code 0 at the end of the +// death test, if it hasn't exited already. +// +// 3. The parent process waits for the sub-process to terminate. +// +// 4. The parent process checks the exit code and error message of +// the sub-process. 
+// +// Examples: +// +// ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number"); +// for (int i = 0; i < 5; i++) { +// EXPECT_DEATH(server.ProcessRequest(i), +// "Invalid request .* in ProcessRequest()") +// << "Failed to die on request " << i; +// } +// +// ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting"); +// +// bool KilledBySIGHUP(int exit_code) { +// return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP; +// } +// +// ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!"); +// +// On the regular expressions used in death tests: +// +// On POSIX-compliant systems (*nix), we use the library, +// which uses the POSIX extended regex syntax. +// +// On other platforms (e.g. Windows), we only support a simple regex +// syntax implemented as part of Google Test. This limited +// implementation should be enough most of the time when writing +// death tests; though it lacks many features you can find in PCRE +// or POSIX extended regex syntax. For example, we don't support +// union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and +// repetition count ("x{5,7}"), among others. +// +// Below is the syntax that we do support. We chose it to be a +// subset of both PCRE and POSIX extended regex, so it's easy to +// learn wherever you come from. In the following: 'A' denotes a +// literal character, period (.), or a single \\ escape sequence; +// 'x' and 'y' denote regular expressions; 'm' and 'n' are for +// natural numbers. +// +// c matches any literal character c +// \\d matches any decimal digit +// \\D matches any character that's not a decimal digit +// \\f matches \f +// \\n matches \n +// \\r matches \r +// \\s matches any ASCII whitespace, including \n +// \\S matches any character that's not a whitespace +// \\t matches \t +// \\v matches \v +// \\w matches any letter, _, or decimal digit +// \\W matches any character that \\w doesn't match +// \\c matches any literal character c, which must be a punctuation +// . matches any single character except \n +// A? matches 0 or 1 occurrences of A +// A* matches 0 or many occurrences of A +// A+ matches 1 or many occurrences of A +// ^ matches the beginning of a string (not that of each line) +// $ matches the end of a string (not that of each line) +// xy matches x followed by y +// +// If you accidentally use PCRE or POSIX extended regex features +// not implemented by us, you will get a run-time failure. In that +// case, please try to rewrite your regular expression within the +// above syntax. +// +// This implementation is *not* meant to be as highly tuned or robust +// as a compiled regex library, but should perform well enough for a +// death test, which already incurs significant overhead by launching +// a child process. +// +// Known caveats: +// +// A "threadsafe" style death test obtains the path to the test +// program from argv[0] and re-executes it in the sub-process. For +// simplicity, the current implementation doesn't search the PATH +// when launching the sub-process. This means that the user must +// invoke the test program via a path that contains at least one +// path separator (e.g. path/to/foo_test and +// /absolute/path/to/bar_test are fine, but foo_test is not). This +// is rarely a problem as people usually don't put the test binary +// directory in PATH. +// +// TODO(wan@google.com): make thread-safe death tests search the PATH. 
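A minimal, self-contained sketch of a death test whose regular expression stays within the portable subset documented above; the CrashWithBadPort() helper is hypothetical, and the example assumes death tests are supported on the target platform and that the binary links against gtest and gtest_main.

    // A complete death test using only the portable regex subset.
    // CrashWithBadPort() is a hypothetical helper used for illustration.
    #include <cstdio>
    #include <cstdlib>
    #include "gtest/gtest.h"

    static void CrashWithBadPort(int port) {
      std::fprintf(stderr, "invalid port %d\n", port);
      std::abort();  // Dies by signal, so EXPECT_DEATH sees a death.
    }

    TEST(PortDeathTest, RejectsNegativePort) {
      // "\\d+" is in the supported subset; alternation ("x|y") and
      // grouping ("(xy)") are not, so they are avoided here.
      EXPECT_DEATH(CrashWithBadPort(-1), "invalid port -\\d+");
    }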
+ +// Asserts that a given statement causes the program to exit, with an +// integer exit status that satisfies predicate, and emitting error output +// that matches regex. +# define ASSERT_EXIT(statement, predicate, regex) \ + GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_) + +// Like ASSERT_EXIT, but continues on to successive tests in the +// test case, if any: +# define EXPECT_EXIT(statement, predicate, regex) \ + GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_) + +// Asserts that a given statement causes the program to exit, either by +// explicitly exiting with a nonzero exit code or being killed by a +// signal, and emitting error output that matches regex. +# define ASSERT_DEATH(statement, regex) \ + ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) + +// Like ASSERT_DEATH, but continues on to successive tests in the +// test case, if any: +# define EXPECT_DEATH(statement, regex) \ + EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) + +// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*: + +// Tests that an exit code describes a normal exit with a given exit code. +class GTEST_API_ ExitedWithCode { + public: + explicit ExitedWithCode(int exit_code); + bool operator()(int exit_status) const; + private: + // No implementation - assignment is unsupported. + void operator=(const ExitedWithCode& other); + + const int exit_code_; +}; + +# if !GTEST_OS_WINDOWS +// Tests that an exit code describes an exit due to termination by a +// given signal. +class GTEST_API_ KilledBySignal { + public: + explicit KilledBySignal(int signum); + bool operator()(int exit_status) const; + private: + const int signum_; +}; +# endif // !GTEST_OS_WINDOWS + +// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode. +// The death testing framework causes this to have interesting semantics, +// since the sideeffects of the call are only visible in opt mode, and not +// in debug mode. +// +// In practice, this can be used to test functions that utilize the +// LOG(DFATAL) macro using the following style: +// +// int DieInDebugOr12(int* sideeffect) { +// if (sideeffect) { +// *sideeffect = 12; +// } +// LOG(DFATAL) << "death"; +// return 12; +// } +// +// TEST(TestCase, TestDieOr12WorksInDgbAndOpt) { +// int sideeffect = 0; +// // Only asserts in dbg. +// EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death"); +// +// #ifdef NDEBUG +// // opt-mode has sideeffect visible. +// EXPECT_EQ(12, sideeffect); +// #else +// // dbg-mode no visible sideeffect. +// EXPECT_EQ(0, sideeffect); +// #endif +// } +// +// This will assert that DieInDebugReturn12InOpt() crashes in debug +// mode, usually due to a DCHECK or LOG(DFATAL), but returns the +// appropriate fallback value (12 in this case) in opt mode. If you +// need to test that a function has appropriate side-effects in opt +// mode, include assertions against the side-effects. A general +// pattern for this is: +// +// EXPECT_DEBUG_DEATH({ +// // Side-effects here will have an effect after this statement in +// // opt mode, but none in debug mode. 
+// EXPECT_EQ(12, DieInDebugOr12(&sideeffect)); +// }, "death"); +// +# ifdef NDEBUG + +# define EXPECT_DEBUG_DEATH(statement, regex) \ + GTEST_EXECUTE_STATEMENT_(statement, regex) + +# define ASSERT_DEBUG_DEATH(statement, regex) \ + GTEST_EXECUTE_STATEMENT_(statement, regex) + +# else + +# define EXPECT_DEBUG_DEATH(statement, regex) \ + EXPECT_DEATH(statement, regex) + +# define ASSERT_DEBUG_DEATH(statement, regex) \ + ASSERT_DEATH(statement, regex) + +# endif // NDEBUG for EXPECT_DEBUG_DEATH +#endif // GTEST_HAS_DEATH_TEST + +// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and +// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if +// death tests are supported; otherwise they just issue a warning. This is +// useful when you are combining death test assertions with normal test +// assertions in one test. +#if GTEST_HAS_DEATH_TEST +# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + EXPECT_DEATH(statement, regex) +# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + ASSERT_DEATH(statement, regex) +#else +# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, ) +# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return) +#endif + +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-message.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-message.h new file mode 100644 index 0000000000..fe879bca79 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-message.h @@ -0,0 +1,250 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file defines the Message class. +// +// IMPORTANT NOTE: Due to limitation of the C++ language, we have to +// leave some internal implementation details in this header file. 
+// They are clearly marked by comments like this: +// +// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +// +// Such code is NOT meant to be used by a user directly, and is subject +// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user +// program! + +#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ +#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ + +#include + +#include "gtest/internal/gtest-port.h" + +// Ensures that there is at least one operator<< in the global namespace. +// See Message& operator<<(...) below for why. +void operator<<(const testing::internal::Secret&, int); + +namespace testing { + +// The Message class works like an ostream repeater. +// +// Typical usage: +// +// 1. You stream a bunch of values to a Message object. +// It will remember the text in a stringstream. +// 2. Then you stream the Message object to an ostream. +// This causes the text in the Message to be streamed +// to the ostream. +// +// For example; +// +// testing::Message foo; +// foo << 1 << " != " << 2; +// std::cout << foo; +// +// will print "1 != 2". +// +// Message is not intended to be inherited from. In particular, its +// destructor is not virtual. +// +// Note that stringstream behaves differently in gcc and in MSVC. You +// can stream a NULL char pointer to it in the former, but not in the +// latter (it causes an access violation if you do). The Message +// class hides this difference by treating a NULL char pointer as +// "(null)". +class GTEST_API_ Message { + private: + // The type of basic IO manipulators (endl, ends, and flush) for + // narrow streams. + typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&); + + public: + // Constructs an empty Message. + Message(); + + // Copy constructor. + Message(const Message& msg) : ss_(new ::std::stringstream) { // NOLINT + *ss_ << msg.GetString(); + } + + // Constructs a Message from a C-string. + explicit Message(const char* str) : ss_(new ::std::stringstream) { + *ss_ << str; + } + +#if GTEST_OS_SYMBIAN + // Streams a value (either a pointer or not) to this object. + template + inline Message& operator <<(const T& value) { + StreamHelper(typename internal::is_pointer::type(), value); + return *this; + } +#else + // Streams a non-pointer value to this object. + template + inline Message& operator <<(const T& val) { + // Some libraries overload << for STL containers. These + // overloads are defined in the global namespace instead of ::std. + // + // C++'s symbol lookup rule (i.e. Koenig lookup) says that these + // overloads are visible in either the std namespace or the global + // namespace, but not other namespaces, including the testing + // namespace which Google Test's Message class is in. + // + // To allow STL containers (and other types that has a << operator + // defined in the global namespace) to be used in Google Test + // assertions, testing::Message must access the custom << operator + // from the global namespace. With this using declaration, + // overloads of << defined in the global namespace and those + // visible via Koenig lookup are both exposed in this function. + using ::operator <<; + *ss_ << val; + return *this; + } + + // Streams a pointer value to this object. + // + // This function is an overload of the previous one. When you + // stream a pointer to a Message, this definition will be used as it + // is more specialized. (The C++ Standard, section + // [temp.func.order].) If you stream a non-pointer, then the + // previous definition will be used. 
+ // + // The reason for this overload is that streaming a NULL pointer to + // ostream is undefined behavior. Depending on the compiler, you + // may get "0", "(nil)", "(null)", or an access violation. To + // ensure consistent result across compilers, we always treat NULL + // as "(null)". + template + inline Message& operator <<(T* const& pointer) { // NOLINT + if (pointer == NULL) { + *ss_ << "(null)"; + } else { + *ss_ << pointer; + } + return *this; + } +#endif // GTEST_OS_SYMBIAN + + // Since the basic IO manipulators are overloaded for both narrow + // and wide streams, we have to provide this specialized definition + // of operator <<, even though its body is the same as the + // templatized version above. Without this definition, streaming + // endl or other basic IO manipulators to Message will confuse the + // compiler. + Message& operator <<(BasicNarrowIoManip val) { + *ss_ << val; + return *this; + } + + // Instead of 1/0, we want to see true/false for bool values. + Message& operator <<(bool b) { + return *this << (b ? "true" : "false"); + } + + // These two overloads allow streaming a wide C string to a Message + // using the UTF-8 encoding. + Message& operator <<(const wchar_t* wide_c_str); + Message& operator <<(wchar_t* wide_c_str); + +#if GTEST_HAS_STD_WSTRING + // Converts the given wide string to a narrow string using the UTF-8 + // encoding, and streams the result to this Message object. + Message& operator <<(const ::std::wstring& wstr); +#endif // GTEST_HAS_STD_WSTRING + +#if GTEST_HAS_GLOBAL_WSTRING + // Converts the given wide string to a narrow string using the UTF-8 + // encoding, and streams the result to this Message object. + Message& operator <<(const ::wstring& wstr); +#endif // GTEST_HAS_GLOBAL_WSTRING + + // Gets the text streamed to this object so far as an std::string. + // Each '\0' character in the buffer is replaced with "\\0". + // + // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + std::string GetString() const; + + private: + +#if GTEST_OS_SYMBIAN + // These are needed as the Nokia Symbian Compiler cannot decide between + // const T& and const T* in a function template. The Nokia compiler _can_ + // decide between class template specializations for T and T*, so a + // tr1::type_traits-like is_pointer works, and we can overload on that. + template + inline void StreamHelper(internal::true_type /*is_pointer*/, T* pointer) { + if (pointer == NULL) { + *ss_ << "(null)"; + } else { + *ss_ << pointer; + } + } + template + inline void StreamHelper(internal::false_type /*is_pointer*/, + const T& value) { + // See the comments in Message& operator <<(const T&) above for why + // we need this using statement. + using ::operator <<; + *ss_ << value; + } +#endif // GTEST_OS_SYMBIAN + + // We'll hold the text streamed to this object here. + const internal::scoped_ptr< ::std::stringstream> ss_; + + // We declare (but don't implement) this to prevent the compiler + // from implementing the assignment operator. + void operator=(const Message&); +}; + +// Streams a Message to an ostream. +inline std::ostream& operator <<(std::ostream& os, const Message& sb) { + return os << sb.GetString(); +} + +namespace internal { + +// Converts a streamable value to an std::string. A NULL pointer is +// converted to "(null)". When the input value is a ::string, +// ::std::string, ::wstring, or ::std::wstring object, each NUL +// character in it is replaced with "\\0". 
+template +std::string StreamableToString(const T& streamable) { + return (Message() << streamable).GetString(); +} + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h new file mode 100644 index 0000000000..038f9ba79e --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h @@ -0,0 +1,1444 @@ +// This file was GENERATED by command: +// pump.py gtest-param-test.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: vladl@google.com (Vlad Losev) +// +// Macros and functions for implementing parameterized tests +// in Google C++ Testing Framework (Google Test) +// +// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ + + +// Value-parameterized tests allow you to test your code with different +// parameters without writing multiple copies of the same test. +// +// Here is how you use value-parameterized tests: + +#if 0 + +// To write value-parameterized tests, first you should define a fixture +// class. It is usually derived from testing::TestWithParam (see below for +// another inheritance scheme that's sometimes useful in more complicated +// class hierarchies), where the type of your parameter values. +// TestWithParam is itself derived from testing::Test. T can be any +// copyable type. If it's a raw pointer, you are responsible for managing the +// lifespan of the pointed values. + +class FooTest : public ::testing::TestWithParam { + // You can implement all the usual class fixture members here. +}; + +// Then, use the TEST_P macro to define as many parameterized tests +// for this fixture as you want. The _P suffix is for "parameterized" +// or "pattern", whichever you prefer to think. 
+ +TEST_P(FooTest, DoesBlah) { + // Inside a test, access the test parameter with the GetParam() method + // of the TestWithParam class: + EXPECT_TRUE(foo.Blah(GetParam())); + ... +} + +TEST_P(FooTest, HasBlahBlah) { + ... +} + +// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test +// case with any set of parameters you want. Google Test defines a number +// of functions for generating test parameters. They return what we call +// (surprise!) parameter generators. Here is a summary of them, which +// are all in the testing namespace: +// +// +// Range(begin, end [, step]) - Yields values {begin, begin+step, +// begin+step+step, ...}. The values do not +// include end. step defaults to 1. +// Values(v1, v2, ..., vN) - Yields values {v1, v2, ..., vN}. +// ValuesIn(container) - Yields values from a C-style array, an STL +// ValuesIn(begin,end) container, or an iterator range [begin, end). +// Bool() - Yields sequence {false, true}. +// Combine(g1, g2, ..., gN) - Yields all combinations (the Cartesian product +// for the math savvy) of the values generated +// by the N generators. +// +// For more details, see comments at the definitions of these functions below +// in this file. +// +// The following statement will instantiate tests from the FooTest test case +// each with parameter values "meeny", "miny", and "moe". + +INSTANTIATE_TEST_CASE_P(InstantiationName, + FooTest, + Values("meeny", "miny", "moe")); + +// To distinguish different instances of the pattern, (yes, you +// can instantiate it more then once) the first argument to the +// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the +// actual test case name. Remember to pick unique prefixes for different +// instantiations. The tests from the instantiation above will have +// these names: +// +// * InstantiationName/FooTest.DoesBlah/0 for "meeny" +// * InstantiationName/FooTest.DoesBlah/1 for "miny" +// * InstantiationName/FooTest.DoesBlah/2 for "moe" +// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny" +// * InstantiationName/FooTest.HasBlahBlah/1 for "miny" +// * InstantiationName/FooTest.HasBlahBlah/2 for "moe" +// +// You can use these names in --gtest_filter. +// +// This statement will instantiate all tests from FooTest again, each +// with parameter values "cat" and "dog": + +const char* pets[] = {"cat", "dog"}; +INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets)); + +// The tests from the instantiation above will have these names: +// +// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog" +// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog" +// +// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests +// in the given test case, whether their definitions come before or +// AFTER the INSTANTIATE_TEST_CASE_P statement. +// +// Please also note that generator expressions (including parameters to the +// generators) are evaluated in InitGoogleTest(), after main() has started. +// This allows the user on one hand, to adjust generator parameters in order +// to dynamically determine a set of tests to run and on the other hand, +// give the user a chance to inspect the generated tests with Google Test +// reflection API before RUN_ALL_TESTS() is executed. +// +// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc +// for more examples. 
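Pulling the pieces above together, a minimal sketch of a complete value-parameterized test in one translation unit; the fixture, test, and instantiation names are illustrative only, and linking against gtest_main is assumed.

    // A complete value-parameterized test in a single file.
    #include <string>
    #include "gtest/gtest.h"

    class GreetingTest : public ::testing::TestWithParam<std::string> {};

    TEST_P(GreetingTest, IsNonEmpty) {
      EXPECT_FALSE(GetParam().empty());
    }

    // Generates GreetingNames/GreetingTest.IsNonEmpty/0 .. /2, selectable
    // with --gtest_filter=GreetingNames/*.
    INSTANTIATE_TEST_CASE_P(GreetingNames, GreetingTest,
                            ::testing::Values(std::string("meeny"),
                                              std::string("miny"),
                                              std::string("moe")));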
+// +// In the future, we plan to publish the API for defining new parameter +// generators. But for now this interface remains part of the internal +// implementation and is subject to change. +// +// +// A parameterized test fixture must be derived from testing::Test and from +// testing::WithParamInterface, where T is the type of the parameter +// values. Inheriting from TestWithParam satisfies that requirement because +// TestWithParam inherits from both Test and WithParamInterface. In more +// complicated hierarchies, however, it is occasionally useful to inherit +// separately from Test and WithParamInterface. For example: + +class BaseTest : public ::testing::Test { + // You can inherit all the usual members for a non-parameterized test + // fixture here. +}; + +class DerivedTest : public BaseTest, public ::testing::WithParamInterface { + // The usual test fixture members go here too. +}; + +TEST_F(BaseTest, HasFoo) { + // This is an ordinary non-parameterized test. +} + +TEST_P(DerivedTest, DoesBlah) { + // GetParam works just the same here as if you inherit from TestWithParam. + EXPECT_TRUE(foo.Blah(GetParam())); +} + +#endif // 0 + +#include "gtest/internal/gtest-port.h" + +#if !GTEST_OS_SYMBIAN +# include +#endif + +// scripts/fuse_gtest.py depends on gtest's own header being #included +// *unconditionally*. Therefore these #includes cannot be moved +// inside #if GTEST_HAS_PARAM_TEST. +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-param-util.h" +#include "gtest/internal/gtest-param-util-generated.h" + +#if GTEST_HAS_PARAM_TEST + +namespace testing { + +// Functions producing parameter generators. +// +// Google Test uses these generators to produce parameters for value- +// parameterized tests. When a parameterized test case is instantiated +// with a particular generator, Google Test creates and runs tests +// for each element in the sequence produced by the generator. +// +// In the following sample, tests from test case FooTest are instantiated +// each three times with parameter values 3, 5, and 8: +// +// class FooTest : public TestWithParam { ... }; +// +// TEST_P(FooTest, TestThis) { +// } +// TEST_P(FooTest, TestThat) { +// } +// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8)); +// + +// Range() returns generators providing sequences of values in a range. +// +// Synopsis: +// Range(start, end) +// - returns a generator producing a sequence of values {start, start+1, +// start+2, ..., }. +// Range(start, end, step) +// - returns a generator producing a sequence of values {start, start+step, +// start+step+step, ..., }. +// Notes: +// * The generated sequences never include end. For example, Range(1, 5) +// returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2) +// returns a generator producing {1, 3, 5, 7}. +// * start and end must have the same type. That type may be any integral or +// floating-point type or a user defined type satisfying these conditions: +// * It must be assignable (have operator=() defined). +// * It must have operator+() (operator+(int-compatible type) for +// two-operand version). +// * It must have operator<() defined. +// Elements in the resulting sequences will also have that type. +// * Condition start < end must be satisfied in order for resulting sequences +// to contain any elements. 
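As a quick illustration of the behaviour described above, a small sketch (names are illustrative) in which Range(0, 10, 3) yields the values {0, 3, 6, 9}, with the end value 10 excluded.

    // Range(0, 10, 3) generates one test instance each for 0, 3, 6 and 9.
    #include "gtest/gtest.h"

    class StepTest : public ::testing::TestWithParam<int> {};

    TEST_P(StepTest, ParamIsMultipleOfThree) {
      EXPECT_EQ(0, GetParam() % 3);
    }

    // The end value 10 is never generated.
    INSTANTIATE_TEST_CASE_P(ZeroToTenByThree, StepTest,
                            ::testing::Range(0, 10, 3));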
+// +template +internal::ParamGenerator Range(T start, T end, IncrementT step) { + return internal::ParamGenerator( + new internal::RangeGenerator(start, end, step)); +} + +template +internal::ParamGenerator Range(T start, T end) { + return Range(start, end, 1); +} + +// ValuesIn() function allows generation of tests with parameters coming from +// a container. +// +// Synopsis: +// ValuesIn(const T (&array)[N]) +// - returns a generator producing sequences with elements from +// a C-style array. +// ValuesIn(const Container& container) +// - returns a generator producing sequences with elements from +// an STL-style container. +// ValuesIn(Iterator begin, Iterator end) +// - returns a generator producing sequences with elements from +// a range [begin, end) defined by a pair of STL-style iterators. These +// iterators can also be plain C pointers. +// +// Please note that ValuesIn copies the values from the containers +// passed in and keeps them to generate tests in RUN_ALL_TESTS(). +// +// Examples: +// +// This instantiates tests from test case StringTest +// each with C-string values of "foo", "bar", and "baz": +// +// const char* strings[] = {"foo", "bar", "baz"}; +// INSTANTIATE_TEST_CASE_P(StringSequence, SrtingTest, ValuesIn(strings)); +// +// This instantiates tests from test case StlStringTest +// each with STL strings with values "a" and "b": +// +// ::std::vector< ::std::string> GetParameterStrings() { +// ::std::vector< ::std::string> v; +// v.push_back("a"); +// v.push_back("b"); +// return v; +// } +// +// INSTANTIATE_TEST_CASE_P(CharSequence, +// StlStringTest, +// ValuesIn(GetParameterStrings())); +// +// +// This will also instantiate tests from CharTest +// each with parameter values 'a' and 'b': +// +// ::std::list GetParameterChars() { +// ::std::list list; +// list.push_back('a'); +// list.push_back('b'); +// return list; +// } +// ::std::list l = GetParameterChars(); +// INSTANTIATE_TEST_CASE_P(CharSequence2, +// CharTest, +// ValuesIn(l.begin(), l.end())); +// +template +internal::ParamGenerator< + typename ::testing::internal::IteratorTraits::value_type> +ValuesIn(ForwardIterator begin, ForwardIterator end) { + typedef typename ::testing::internal::IteratorTraits + ::value_type ParamType; + return internal::ParamGenerator( + new internal::ValuesInIteratorRangeGenerator(begin, end)); +} + +template +internal::ParamGenerator ValuesIn(const T (&array)[N]) { + return ValuesIn(array, array + N); +} + +template +internal::ParamGenerator ValuesIn( + const Container& container) { + return ValuesIn(container.begin(), container.end()); +} + +// Values() allows generating tests from explicitly specified list of +// parameters. +// +// Synopsis: +// Values(T v1, T v2, ..., T vN) +// - returns a generator producing sequences with elements v1, v2, ..., vN. +// +// For example, this instantiates tests from test case BarTest each +// with values "one", "two", and "three": +// +// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three")); +// +// This instantiates tests from test case BazTest each with values 1, 2, 3.5. +// The exact type of values will depend on the type of parameter in BazTest. +// +// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5)); +// +// Currently, Values() supports from 1 to 50 parameters. 
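The ValuesIn() and Values() documentation above translates into code along these lines; the sketch assumes <string> and <vector> are available, and the fixture name WordListTest is hypothetical:

#include <string>
#include <vector>

static std::vector<std::string> GetTestWords() {
  std::vector<std::string> words;
  words.push_back("foo");
  words.push_back("bar");
  words.push_back("baz");
  return words;
}

class WordListTest : public ::testing::TestWithParam<std::string> {};

TEST_P(WordListTest, IsThreeCharsLong) {
  EXPECT_EQ(3u, GetParam().size());
}

// ValuesIn() copies the elements, so passing a temporary container is safe;
// Values("foo", "bar", "baz") would produce the same parameter sequence here.
INSTANTIATE_TEST_CASE_P(WordSequence, WordListTest,
                        ::testing::ValuesIn(GetTestWords()));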
+// +template +internal::ValueArray1 Values(T1 v1) { + return internal::ValueArray1(v1); +} + +template +internal::ValueArray2 Values(T1 v1, T2 v2) { + return internal::ValueArray2(v1, v2); +} + +template +internal::ValueArray3 Values(T1 v1, T2 v2, T3 v3) { + return internal::ValueArray3(v1, v2, v3); +} + +template +internal::ValueArray4 Values(T1 v1, T2 v2, T3 v3, T4 v4) { + return internal::ValueArray4(v1, v2, v3, v4); +} + +template +internal::ValueArray5 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5) { + return internal::ValueArray5(v1, v2, v3, v4, v5); +} + +template +internal::ValueArray6 Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6) { + return internal::ValueArray6(v1, v2, v3, v4, v5, v6); +} + +template +internal::ValueArray7 Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6, T7 v7) { + return internal::ValueArray7(v1, v2, v3, v4, v5, + v6, v7); +} + +template +internal::ValueArray8 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8) { + return internal::ValueArray8(v1, v2, v3, v4, + v5, v6, v7, v8); +} + +template +internal::ValueArray9 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9) { + return internal::ValueArray9(v1, v2, v3, + v4, v5, v6, v7, v8, v9); +} + +template +internal::ValueArray10 Values(T1 v1, + T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10) { + return internal::ValueArray10(v1, + v2, v3, v4, v5, v6, v7, v8, v9, v10); +} + +template +internal::ValueArray11 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11) { + return internal::ValueArray11(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11); +} + +template +internal::ValueArray12 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12) { + return internal::ValueArray12(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12); +} + +template +internal::ValueArray13 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13) { + return internal::ValueArray13(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13); +} + +template +internal::ValueArray14 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) { + return internal::ValueArray14(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14); +} + +template +internal::ValueArray15 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) { + return internal::ValueArray15(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); +} + +template +internal::ValueArray16 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16) { + return internal::ValueArray16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15, v16); +} + +template +internal::ValueArray17 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17) { + return internal::ValueArray17(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, + v11, v12, v13, v14, v15, v16, v17); +} + +template +internal::ValueArray18 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, + T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18) { + return internal::ValueArray18(v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18); +} + +template +internal::ValueArray19 
Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, + T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, + T15 v15, T16 v16, T17 v17, T18 v18, T19 v19) { + return internal::ValueArray19(v1, v2, v3, v4, v5, v6, v7, v8, + v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19); +} + +template +internal::ValueArray20 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20) { + return internal::ValueArray20(v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20); +} + +template +internal::ValueArray21 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21) { + return internal::ValueArray21(v1, v2, v3, v4, v5, v6, + v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21); +} + +template +internal::ValueArray22 Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22) { + return internal::ValueArray22(v1, v2, v3, v4, + v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22); +} + +template +internal::ValueArray23 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23) { + return internal::ValueArray23(v1, v2, v3, + v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23); +} + +template +internal::ValueArray24 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24) { + return internal::ValueArray24(v1, v2, + v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, + v19, v20, v21, v22, v23, v24); +} + +template +internal::ValueArray25 Values(T1 v1, + T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, + T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, + T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25) { + return internal::ValueArray25(v1, + v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, + v18, v19, v20, v21, v22, v23, v24, v25); +} + +template +internal::ValueArray26 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26) { + return internal::ValueArray26(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26); +} + +template +internal::ValueArray27 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27) { + return internal::ValueArray27(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, + v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27); +} + +template +internal::ValueArray28 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 
v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28) { + return internal::ValueArray28(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, + v28); +} + +template +internal::ValueArray29 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29) { + return internal::ValueArray29(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, + v27, v28, v29); +} + +template +internal::ValueArray30 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, + T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, + T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) { + return internal::ValueArray30(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, + v26, v27, v28, v29, v30); +} + +template +internal::ValueArray31 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) { + return internal::ValueArray31(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, + v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, + v25, v26, v27, v28, v29, v30, v31); +} + +template +internal::ValueArray32 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32) { + return internal::ValueArray32(v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32); +} + +template +internal::ValueArray33 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, + T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33) { + return internal::ValueArray33(v1, v2, v3, v4, v5, v6, v7, v8, + v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32, v33); +} + +template +internal::ValueArray34 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, + T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, + T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, + T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, + T31 v31, T32 v32, T33 v33, T34 v34) { + return internal::ValueArray34(v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, + v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34); +} + +template +internal::ValueArray35 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, 
T19 v19, T20 v20, T21 v21, + T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, + T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35) { + return internal::ValueArray35(v1, v2, v3, v4, v5, v6, + v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, + v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35); +} + +template +internal::ValueArray36 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, + T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, + T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36) { + return internal::ValueArray36(v1, v2, v3, v4, + v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, + v34, v35, v36); +} + +template +internal::ValueArray37 Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, + T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, + T37 v37) { + return internal::ValueArray37(v1, v2, v3, + v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, + v34, v35, v36, v37); +} + +template +internal::ValueArray38 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, + T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, + T37 v37, T38 v38) { + return internal::ValueArray38(v1, v2, + v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, + v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, + v33, v34, v35, v36, v37, v38); +} + +template +internal::ValueArray39 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, + T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, + T37 v37, T38 v38, T39 v39) { + return internal::ValueArray39(v1, + v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, + v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39); +} + +template +internal::ValueArray40 Values(T1 v1, + T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, + T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, + T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, + T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, + T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) { + return internal::ValueArray40(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, + v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40); +} + +template +internal::ValueArray41 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + 
T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41) { + return internal::ValueArray41(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, + v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, + v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41); +} + +template +internal::ValueArray42 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42) { + return internal::ValueArray42(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, + v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, + v42); +} + +template +internal::ValueArray43 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43) { + return internal::ValueArray43(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, + v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, + v41, v42, v43); +} + +template +internal::ValueArray44 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44) { + return internal::ValueArray44(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, + v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, + v40, v41, v42, v43, v44); +} + +template +internal::ValueArray45 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, + T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, + T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, + T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, + T41 v41, T42 v42, T43 v43, T44 v44, T45 v45) { + return internal::ValueArray45(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, + v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, + v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, + v39, v40, v41, v42, v43, v44, v45); +} + +template +internal::ValueArray46 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 
v39, + T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) { + return internal::ValueArray46(v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, + v38, v39, v40, v41, v42, v43, v44, v45, v46); +} + +template +internal::ValueArray47 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, + T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) { + return internal::ValueArray47(v1, v2, v3, v4, v5, v6, v7, v8, + v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, + v38, v39, v40, v41, v42, v43, v44, v45, v46, v47); +} + +template +internal::ValueArray48 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, + T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, + T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, + T48 v48) { + return internal::ValueArray48(v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, + v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, + v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48); +} + +template +internal::ValueArray49 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, + T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, + T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, + T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, + T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, + T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, + T47 v47, T48 v48, T49 v49) { + return internal::ValueArray49(v1, v2, v3, v4, v5, v6, + v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, + v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, + v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49); +} + +template +internal::ValueArray50 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, + T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, + T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, + T38 v38, T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, + T46 v46, T47 v47, T48 v48, T49 v49, T50 v50) { + return internal::ValueArray50(v1, v2, v3, v4, + v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, + v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50); +} + +// Bool() allows generating tests with parameters in a set of (false, true). +// +// Synopsis: +// Bool() +// - returns a generator producing sequences with elements {false, true}. 
+// +// It is useful when testing code that depends on Boolean flags. Combinations +// of multiple flags can be tested when several Bool()'s are combined using +// Combine() function. +// +// In the following example all tests in the test case FlagDependentTest +// will be instantiated twice with parameters false and true. +// +// class FlagDependentTest : public testing::TestWithParam { +// virtual void SetUp() { +// external_flag = GetParam(); +// } +// } +// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool()); +// +inline internal::ParamGenerator Bool() { + return Values(false, true); +} + +# if GTEST_HAS_COMBINE +// Combine() allows the user to combine two or more sequences to produce +// values of a Cartesian product of those sequences' elements. +// +// Synopsis: +// Combine(gen1, gen2, ..., genN) +// - returns a generator producing sequences with elements coming from +// the Cartesian product of elements from the sequences generated by +// gen1, gen2, ..., genN. The sequence elements will have a type of +// tuple where T1, T2, ..., TN are the types +// of elements from sequences produces by gen1, gen2, ..., genN. +// +// Combine can have up to 10 arguments. This number is currently limited +// by the maximum number of elements in the tuple implementation used by Google +// Test. +// +// Example: +// +// This will instantiate tests in test case AnimalTest each one with +// the parameter values tuple("cat", BLACK), tuple("cat", WHITE), +// tuple("dog", BLACK), and tuple("dog", WHITE): +// +// enum Color { BLACK, GRAY, WHITE }; +// class AnimalTest +// : public testing::TestWithParam > {...}; +// +// TEST_P(AnimalTest, AnimalLooksNice) {...} +// +// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest, +// Combine(Values("cat", "dog"), +// Values(BLACK, WHITE))); +// +// This will instantiate tests in FlagDependentTest with all variations of two +// Boolean flags: +// +// class FlagDependentTest +// : public testing::TestWithParam > { +// virtual void SetUp() { +// // Assigns external_flag_1 and external_flag_2 values from the tuple. +// tie(external_flag_1, external_flag_2) = GetParam(); +// } +// }; +// +// TEST_P(FlagDependentTest, TestFeature1) { +// // Test your code using external_flag_1 and external_flag_2 here. 
+// } +// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest, +// Combine(Bool(), Bool())); +// +template +internal::CartesianProductHolder2 Combine( + const Generator1& g1, const Generator2& g2) { + return internal::CartesianProductHolder2( + g1, g2); +} + +template +internal::CartesianProductHolder3 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3) { + return internal::CartesianProductHolder3( + g1, g2, g3); +} + +template +internal::CartesianProductHolder4 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4) { + return internal::CartesianProductHolder4( + g1, g2, g3, g4); +} + +template +internal::CartesianProductHolder5 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5) { + return internal::CartesianProductHolder5( + g1, g2, g3, g4, g5); +} + +template +internal::CartesianProductHolder6 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6) { + return internal::CartesianProductHolder6( + g1, g2, g3, g4, g5, g6); +} + +template +internal::CartesianProductHolder7 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6, + const Generator7& g7) { + return internal::CartesianProductHolder7( + g1, g2, g3, g4, g5, g6, g7); +} + +template +internal::CartesianProductHolder8 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6, + const Generator7& g7, const Generator8& g8) { + return internal::CartesianProductHolder8( + g1, g2, g3, g4, g5, g6, g7, g8); +} + +template +internal::CartesianProductHolder9 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6, + const Generator7& g7, const Generator8& g8, const Generator9& g9) { + return internal::CartesianProductHolder9( + g1, g2, g3, g4, g5, g6, g7, g8, g9); +} + +template +internal::CartesianProductHolder10 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6, + const Generator7& g7, const Generator8& g8, const Generator9& g9, + const Generator10& g10) { + return internal::CartesianProductHolder10( + g1, g2, g3, g4, g5, g6, g7, g8, g9, g10); +} +# endif // GTEST_HAS_COMBINE + + + +# define TEST_P(test_case_name, test_name) \ + class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ + : public test_case_name { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \ + virtual void TestBody(); \ + private: \ + static int AddToRegistry() { \ + ::testing::UnitTest::GetInstance()->parameterized_test_registry(). 
\ + GetTestCasePatternHolder(\ + #test_case_name, \ + ::testing::internal::CodeLocation(\ + __FILE__, __LINE__))->AddTestPattern(\ + #test_case_name, \ + #test_name, \ + new ::testing::internal::TestMetaFactory< \ + GTEST_TEST_CLASS_NAME_(\ + test_case_name, test_name)>()); \ + return 0; \ + } \ + static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ + GTEST_DISALLOW_COPY_AND_ASSIGN_(\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \ + }; \ + int GTEST_TEST_CLASS_NAME_(test_case_name, \ + test_name)::gtest_registering_dummy_ = \ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \ + void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() + +// The optional last argument to INSTANTIATE_TEST_CASE_P allows the user +// to specify a function or functor that generates custom test name suffixes +// based on the test parameters. The function should accept one argument of +// type testing::TestParamInfo, and return std::string. +// +// testing::PrintToStringParamName is a builtin test suffix generator that +// returns the value of testing::PrintToString(GetParam()). It does not work +// for std::string or C strings. +// +// Note: test names must be non-empty, unique, and may only contain ASCII +// alphanumeric characters or underscore. + +# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator, ...) \ + ::testing::internal::ParamGenerator \ + gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \ + ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \ + const ::testing::TestParamInfo& info) { \ + return ::testing::internal::GetParamNameGen \ + (__VA_ARGS__)(info); \ + } \ + int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ + GetTestCasePatternHolder(\ + #test_case_name, \ + ::testing::internal::CodeLocation(\ + __FILE__, __LINE__))->AddTestCaseInstantiation(\ + #prefix, \ + >est_##prefix##test_case_name##_EvalGenerator_, \ + >est_##prefix##test_case_name##_EvalGenerateName_, \ + __FILE__, __LINE__) + +} // namespace testing + +#endif // GTEST_HAS_PARAM_TEST + +#endif // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h.pump b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h.pump new file mode 100644 index 0000000000..3078d6d2a1 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h.pump @@ -0,0 +1,510 @@ +$$ -*- mode: c++; -*- +$var n = 50 $$ Maximum length of Values arguments we want to support. +$var maxtuple = 10 $$ Maximum number of Combine arguments we want to support. +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: vladl@google.com (Vlad Losev) +// +// Macros and functions for implementing parameterized tests +// in Google C++ Testing Framework (Google Test) +// +// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ + + +// Value-parameterized tests allow you to test your code with different +// parameters without writing multiple copies of the same test. +// +// Here is how you use value-parameterized tests: + +#if 0 + +// To write value-parameterized tests, first you should define a fixture +// class. It is usually derived from testing::TestWithParam (see below for +// another inheritance scheme that's sometimes useful in more complicated +// class hierarchies), where the type of your parameter values. +// TestWithParam is itself derived from testing::Test. T can be any +// copyable type. If it's a raw pointer, you are responsible for managing the +// lifespan of the pointed values. + +class FooTest : public ::testing::TestWithParam { + // You can implement all the usual class fixture members here. +}; + +// Then, use the TEST_P macro to define as many parameterized tests +// for this fixture as you want. The _P suffix is for "parameterized" +// or "pattern", whichever you prefer to think. + +TEST_P(FooTest, DoesBlah) { + // Inside a test, access the test parameter with the GetParam() method + // of the TestWithParam class: + EXPECT_TRUE(foo.Blah(GetParam())); + ... +} + +TEST_P(FooTest, HasBlahBlah) { + ... +} + +// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test +// case with any set of parameters you want. Google Test defines a number +// of functions for generating test parameters. They return what we call +// (surprise!) parameter generators. Here is a summary of them, which +// are all in the testing namespace: +// +// +// Range(begin, end [, step]) - Yields values {begin, begin+step, +// begin+step+step, ...}. The values do not +// include end. step defaults to 1. +// Values(v1, v2, ..., vN) - Yields values {v1, v2, ..., vN}. +// ValuesIn(container) - Yields values from a C-style array, an STL +// ValuesIn(begin,end) container, or an iterator range [begin, end). +// Bool() - Yields sequence {false, true}. +// Combine(g1, g2, ..., gN) - Yields all combinations (the Cartesian product +// for the math savvy) of the values generated +// by the N generators. +// +// For more details, see comments at the definitions of these functions below +// in this file. +// +// The following statement will instantiate tests from the FooTest test case +// each with parameter values "meeny", "miny", and "moe". 
+ +INSTANTIATE_TEST_CASE_P(InstantiationName, + FooTest, + Values("meeny", "miny", "moe")); + +// To distinguish different instances of the pattern, (yes, you +// can instantiate it more then once) the first argument to the +// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the +// actual test case name. Remember to pick unique prefixes for different +// instantiations. The tests from the instantiation above will have +// these names: +// +// * InstantiationName/FooTest.DoesBlah/0 for "meeny" +// * InstantiationName/FooTest.DoesBlah/1 for "miny" +// * InstantiationName/FooTest.DoesBlah/2 for "moe" +// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny" +// * InstantiationName/FooTest.HasBlahBlah/1 for "miny" +// * InstantiationName/FooTest.HasBlahBlah/2 for "moe" +// +// You can use these names in --gtest_filter. +// +// This statement will instantiate all tests from FooTest again, each +// with parameter values "cat" and "dog": + +const char* pets[] = {"cat", "dog"}; +INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets)); + +// The tests from the instantiation above will have these names: +// +// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog" +// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog" +// +// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests +// in the given test case, whether their definitions come before or +// AFTER the INSTANTIATE_TEST_CASE_P statement. +// +// Please also note that generator expressions (including parameters to the +// generators) are evaluated in InitGoogleTest(), after main() has started. +// This allows the user on one hand, to adjust generator parameters in order +// to dynamically determine a set of tests to run and on the other hand, +// give the user a chance to inspect the generated tests with Google Test +// reflection API before RUN_ALL_TESTS() is executed. +// +// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc +// for more examples. +// +// In the future, we plan to publish the API for defining new parameter +// generators. But for now this interface remains part of the internal +// implementation and is subject to change. +// +// +// A parameterized test fixture must be derived from testing::Test and from +// testing::WithParamInterface, where T is the type of the parameter +// values. Inheriting from TestWithParam satisfies that requirement because +// TestWithParam inherits from both Test and WithParamInterface. In more +// complicated hierarchies, however, it is occasionally useful to inherit +// separately from Test and WithParamInterface. For example: + +class BaseTest : public ::testing::Test { + // You can inherit all the usual members for a non-parameterized test + // fixture here. +}; + +class DerivedTest : public BaseTest, public ::testing::WithParamInterface { + // The usual test fixture members go here too. +}; + +TEST_F(BaseTest, HasFoo) { + // This is an ordinary non-parameterized test. +} + +TEST_P(DerivedTest, DoesBlah) { + // GetParam works just the same here as if you inherit from TestWithParam. + EXPECT_TRUE(foo.Blah(GetParam())); +} + +#endif // 0 + +#include "gtest/internal/gtest-port.h" + +#if !GTEST_OS_SYMBIAN +# include +#endif + +// scripts/fuse_gtest.py depends on gtest's own header being #included +// *unconditionally*. Therefore these #includes cannot be moved +// inside #if GTEST_HAS_PARAM_TEST. 
+#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-param-util.h" +#include "gtest/internal/gtest-param-util-generated.h" + +#if GTEST_HAS_PARAM_TEST + +namespace testing { + +// Functions producing parameter generators. +// +// Google Test uses these generators to produce parameters for value- +// parameterized tests. When a parameterized test case is instantiated +// with a particular generator, Google Test creates and runs tests +// for each element in the sequence produced by the generator. +// +// In the following sample, tests from test case FooTest are instantiated +// each three times with parameter values 3, 5, and 8: +// +// class FooTest : public TestWithParam { ... }; +// +// TEST_P(FooTest, TestThis) { +// } +// TEST_P(FooTest, TestThat) { +// } +// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8)); +// + +// Range() returns generators providing sequences of values in a range. +// +// Synopsis: +// Range(start, end) +// - returns a generator producing a sequence of values {start, start+1, +// start+2, ..., }. +// Range(start, end, step) +// - returns a generator producing a sequence of values {start, start+step, +// start+step+step, ..., }. +// Notes: +// * The generated sequences never include end. For example, Range(1, 5) +// returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2) +// returns a generator producing {1, 3, 5, 7}. +// * start and end must have the same type. That type may be any integral or +// floating-point type or a user defined type satisfying these conditions: +// * It must be assignable (have operator=() defined). +// * It must have operator+() (operator+(int-compatible type) for +// two-operand version). +// * It must have operator<() defined. +// Elements in the resulting sequences will also have that type. +// * Condition start < end must be satisfied in order for resulting sequences +// to contain any elements. +// +template +internal::ParamGenerator Range(T start, T end, IncrementT step) { + return internal::ParamGenerator( + new internal::RangeGenerator(start, end, step)); +} + +template +internal::ParamGenerator Range(T start, T end) { + return Range(start, end, 1); +} + +// ValuesIn() function allows generation of tests with parameters coming from +// a container. +// +// Synopsis: +// ValuesIn(const T (&array)[N]) +// - returns a generator producing sequences with elements from +// a C-style array. +// ValuesIn(const Container& container) +// - returns a generator producing sequences with elements from +// an STL-style container. +// ValuesIn(Iterator begin, Iterator end) +// - returns a generator producing sequences with elements from +// a range [begin, end) defined by a pair of STL-style iterators. These +// iterators can also be plain C pointers. +// +// Please note that ValuesIn copies the values from the containers +// passed in and keeps them to generate tests in RUN_ALL_TESTS(). 
+// +// Examples: +// +// This instantiates tests from test case StringTest +// each with C-string values of "foo", "bar", and "baz": +// +// const char* strings[] = {"foo", "bar", "baz"}; +// INSTANTIATE_TEST_CASE_P(StringSequence, SrtingTest, ValuesIn(strings)); +// +// This instantiates tests from test case StlStringTest +// each with STL strings with values "a" and "b": +// +// ::std::vector< ::std::string> GetParameterStrings() { +// ::std::vector< ::std::string> v; +// v.push_back("a"); +// v.push_back("b"); +// return v; +// } +// +// INSTANTIATE_TEST_CASE_P(CharSequence, +// StlStringTest, +// ValuesIn(GetParameterStrings())); +// +// +// This will also instantiate tests from CharTest +// each with parameter values 'a' and 'b': +// +// ::std::list GetParameterChars() { +// ::std::list list; +// list.push_back('a'); +// list.push_back('b'); +// return list; +// } +// ::std::list l = GetParameterChars(); +// INSTANTIATE_TEST_CASE_P(CharSequence2, +// CharTest, +// ValuesIn(l.begin(), l.end())); +// +template +internal::ParamGenerator< + typename ::testing::internal::IteratorTraits::value_type> +ValuesIn(ForwardIterator begin, ForwardIterator end) { + typedef typename ::testing::internal::IteratorTraits + ::value_type ParamType; + return internal::ParamGenerator( + new internal::ValuesInIteratorRangeGenerator(begin, end)); +} + +template +internal::ParamGenerator ValuesIn(const T (&array)[N]) { + return ValuesIn(array, array + N); +} + +template +internal::ParamGenerator ValuesIn( + const Container& container) { + return ValuesIn(container.begin(), container.end()); +} + +// Values() allows generating tests from explicitly specified list of +// parameters. +// +// Synopsis: +// Values(T v1, T v2, ..., T vN) +// - returns a generator producing sequences with elements v1, v2, ..., vN. +// +// For example, this instantiates tests from test case BarTest each +// with values "one", "two", and "three": +// +// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three")); +// +// This instantiates tests from test case BazTest each with values 1, 2, 3.5. +// The exact type of values will depend on the type of parameter in BazTest. +// +// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5)); +// +// Currently, Values() supports from 1 to $n parameters. +// +$range i 1..n +$for i [[ +$range j 1..i + +template <$for j, [[typename T$j]]> +internal::ValueArray$i<$for j, [[T$j]]> Values($for j, [[T$j v$j]]) { + return internal::ValueArray$i<$for j, [[T$j]]>($for j, [[v$j]]); +} + +]] + +// Bool() allows generating tests with parameters in a set of (false, true). +// +// Synopsis: +// Bool() +// - returns a generator producing sequences with elements {false, true}. +// +// It is useful when testing code that depends on Boolean flags. Combinations +// of multiple flags can be tested when several Bool()'s are combined using +// Combine() function. +// +// In the following example all tests in the test case FlagDependentTest +// will be instantiated twice with parameters false and true. +// +// class FlagDependentTest : public testing::TestWithParam { +// virtual void SetUp() { +// external_flag = GetParam(); +// } +// } +// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool()); +// +inline internal::ParamGenerator Bool() { + return Values(false, true); +} + +# if GTEST_HAS_COMBINE +// Combine() allows the user to combine two or more sequences to produce +// values of a Cartesian product of those sequences' elements. 
+// +// Synopsis: +// Combine(gen1, gen2, ..., genN) +// - returns a generator producing sequences with elements coming from +// the Cartesian product of elements from the sequences generated by +// gen1, gen2, ..., genN. The sequence elements will have a type of +// tuple where T1, T2, ..., TN are the types +// of elements from sequences produces by gen1, gen2, ..., genN. +// +// Combine can have up to $maxtuple arguments. This number is currently limited +// by the maximum number of elements in the tuple implementation used by Google +// Test. +// +// Example: +// +// This will instantiate tests in test case AnimalTest each one with +// the parameter values tuple("cat", BLACK), tuple("cat", WHITE), +// tuple("dog", BLACK), and tuple("dog", WHITE): +// +// enum Color { BLACK, GRAY, WHITE }; +// class AnimalTest +// : public testing::TestWithParam > {...}; +// +// TEST_P(AnimalTest, AnimalLooksNice) {...} +// +// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest, +// Combine(Values("cat", "dog"), +// Values(BLACK, WHITE))); +// +// This will instantiate tests in FlagDependentTest with all variations of two +// Boolean flags: +// +// class FlagDependentTest +// : public testing::TestWithParam > { +// virtual void SetUp() { +// // Assigns external_flag_1 and external_flag_2 values from the tuple. +// tie(external_flag_1, external_flag_2) = GetParam(); +// } +// }; +// +// TEST_P(FlagDependentTest, TestFeature1) { +// // Test your code using external_flag_1 and external_flag_2 here. +// } +// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest, +// Combine(Bool(), Bool())); +// +$range i 2..maxtuple +$for i [[ +$range j 1..i + +template <$for j, [[typename Generator$j]]> +internal::CartesianProductHolder$i<$for j, [[Generator$j]]> Combine( + $for j, [[const Generator$j& g$j]]) { + return internal::CartesianProductHolder$i<$for j, [[Generator$j]]>( + $for j, [[g$j]]); +} + +]] +# endif // GTEST_HAS_COMBINE + + + +# define TEST_P(test_case_name, test_name) \ + class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ + : public test_case_name { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \ + virtual void TestBody(); \ + private: \ + static int AddToRegistry() { \ + ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ + GetTestCasePatternHolder(\ + #test_case_name, \ + ::testing::internal::CodeLocation(\ + __FILE__, __LINE__))->AddTestPattern(\ + #test_case_name, \ + #test_name, \ + new ::testing::internal::TestMetaFactory< \ + GTEST_TEST_CLASS_NAME_(\ + test_case_name, test_name)>()); \ + return 0; \ + } \ + static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ + GTEST_DISALLOW_COPY_AND_ASSIGN_(\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \ + }; \ + int GTEST_TEST_CLASS_NAME_(test_case_name, \ + test_name)::gtest_registering_dummy_ = \ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \ + void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() + +// The optional last argument to INSTANTIATE_TEST_CASE_P allows the user +// to specify a function or functor that generates custom test name suffixes +// based on the test parameters. The function should accept one argument of +// type testing::TestParamInfo, and return std::string. +// +// testing::PrintToStringParamName is a builtin test suffix generator that +// returns the value of testing::PrintToString(GetParam()). +// +// Note: test names must be non-empty, unique, and may only contain ASCII +// alphanumeric characters or underscore. 
Because PrintToString adds quotes +// to std::string and C strings, it won't work for these types. + +# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator, ...) \ + ::testing::internal::ParamGenerator \ + gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \ + ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \ + const ::testing::TestParamInfo& info) { \ + return ::testing::internal::GetParamNameGen \ + (__VA_ARGS__)(info); \ + } \ + int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ + GetTestCasePatternHolder(\ + #test_case_name, \ + ::testing::internal::CodeLocation(\ + __FILE__, __LINE__))->AddTestCaseInstantiation(\ + #prefix, \ + >est_##prefix##test_case_name##_EvalGenerator_, \ + >est_##prefix##test_case_name##_EvalGenerateName_, \ + __FILE__, __LINE__) + +} // namespace testing + +#endif // GTEST_HAS_PARAM_TEST + +#endif // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h new file mode 100644 index 0000000000..8a33164cb3 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h @@ -0,0 +1,993 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Google Test - The Google C++ Testing Framework +// +// This file implements a universal value printer that can print a +// value of any type T: +// +// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); +// +// A user can teach this function how to print a class type T by +// defining either operator<<() or PrintTo() in the namespace that +// defines T. More specifically, the FIRST defined function in the +// following list will be used (assuming T is defined in namespace +// foo): +// +// 1. foo::PrintTo(const T&, ostream*) +// 2. 
operator<<(ostream&, const T&) defined in either foo or the +// global namespace. +// +// If none of the above is defined, it will print the debug string of +// the value if it is a protocol buffer, or print the raw bytes in the +// value otherwise. +// +// To aid debugging: when T is a reference type, the address of the +// value is also printed; when T is a (const) char pointer, both the +// pointer value and the NUL-terminated string it points to are +// printed. +// +// We also provide some convenient wrappers: +// +// // Prints a value to a string. For a (const or not) char +// // pointer, the NUL-terminated string (but not the pointer) is +// // printed. +// std::string ::testing::PrintToString(const T& value); +// +// // Prints a value tersely: for a reference type, the referenced +// // value (but not the address) is printed; for a (const or not) char +// // pointer, the NUL-terminated string (but not the pointer) is +// // printed. +// void ::testing::internal::UniversalTersePrint(const T& value, ostream*); +// +// // Prints value using the type inferred by the compiler. The difference +// // from UniversalTersePrint() is that this function prints both the +// // pointer and the NUL-terminated string for a (const or not) char pointer. +// void ::testing::internal::UniversalPrint(const T& value, ostream*); +// +// // Prints the fields of a tuple tersely to a string vector, one +// // element for each field. Tuple support must be enabled in +// // gtest-port.h. +// std::vector UniversalTersePrintTupleFieldsToStrings( +// const Tuple& value); +// +// Known limitation: +// +// The print primitives print the elements of an STL-style container +// using the compiler-inferred type of *iter where iter is a +// const_iterator of the container. When const_iterator is an input +// iterator but not a forward iterator, this inferred type may not +// match value_type, and the print output may be incorrect. In +// practice, this is rarely a problem as for most containers +// const_iterator is a forward iterator. We'll fix this if there's an +// actual need for it. Note that this fix cannot rely on value_type +// being defined as many user-defined container types don't have +// value_type. + +#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ + +#include // NOLINT +#include +#include +#include +#include +#include "gtest/internal/gtest-port.h" +#include "gtest/internal/gtest-internal.h" + +#if GTEST_HAS_STD_TUPLE_ +# include +#endif + +namespace testing { + +// Definitions in the 'internal' and 'internal2' name spaces are +// subject to change without notice. DO NOT USE THEM IN USER CODE! +namespace internal2 { + +// Prints the given number of bytes in the given object to the given +// ostream. +GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes, + size_t count, + ::std::ostream* os); + +// For selecting which printer to use when a given type has neither << +// nor PrintTo(). +enum TypeKind { + kProtobuf, // a protobuf type + kConvertibleToInteger, // a type implicitly convertible to BiggestInt + // (e.g. a named or unnamed enum type) + kOtherType // anything else +}; + +// TypeWithoutFormatter::PrintValue(value, os) is called +// by the universal printer to print a value of type T when neither +// operator<< nor PrintTo() is defined for T, where kTypeKind is the +// "kind" of T as defined by enum TypeKind. +template +class TypeWithoutFormatter { + public: + // This default version is called when kTypeKind is kOtherType. 
+ static void PrintValue(const T& value, ::std::ostream* os) { + PrintBytesInObjectTo(reinterpret_cast(&value), + sizeof(value), os); + } +}; + +// We print a protobuf using its ShortDebugString() when the string +// doesn't exceed this many characters; otherwise we print it using +// DebugString() for better readability. +const size_t kProtobufOneLinerMaxLength = 50; + +template +class TypeWithoutFormatter { + public: + static void PrintValue(const T& value, ::std::ostream* os) { + const ::testing::internal::string short_str = value.ShortDebugString(); + const ::testing::internal::string pretty_str = + short_str.length() <= kProtobufOneLinerMaxLength ? + short_str : ("\n" + value.DebugString()); + *os << ("<" + pretty_str + ">"); + } +}; + +template +class TypeWithoutFormatter { + public: + // Since T has no << operator or PrintTo() but can be implicitly + // converted to BiggestInt, we print it as a BiggestInt. + // + // Most likely T is an enum type (either named or unnamed), in which + // case printing it as an integer is the desired behavior. In case + // T is not an enum, printing it as an integer is the best we can do + // given that it has no user-defined printer. + static void PrintValue(const T& value, ::std::ostream* os) { + const internal::BiggestInt kBigInt = value; + *os << kBigInt; + } +}; + +// Prints the given value to the given ostream. If the value is a +// protocol message, its debug string is printed; if it's an enum or +// of a type implicitly convertible to BiggestInt, it's printed as an +// integer; otherwise the bytes in the value are printed. This is +// what UniversalPrinter::Print() does when it knows nothing about +// type T and T has neither << operator nor PrintTo(). +// +// A user can override this behavior for a class type Foo by defining +// a << operator in the namespace where Foo is defined. +// +// We put this operator in namespace 'internal2' instead of 'internal' +// to simplify the implementation, as much code in 'internal' needs to +// use << in STL, which would conflict with our own << were it defined +// in 'internal'. +// +// Note that this operator<< takes a generic std::basic_ostream type instead of the more restricted std::ostream. If +// we define it to take an std::ostream instead, we'll get an +// "ambiguous overloads" compiler error when trying to print a type +// Foo that supports streaming to std::basic_ostream, as the compiler cannot tell whether +// operator<<(std::ostream&, const T&) or +// operator<<(std::basic_stream, const Foo&) is more +// specific. +template +::std::basic_ostream& operator<<( + ::std::basic_ostream& os, const T& x) { + TypeWithoutFormatter::value ? kProtobuf : + internal::ImplicitlyConvertible::value ? + kConvertibleToInteger : kOtherType)>::PrintValue(x, &os); + return os; +} + +} // namespace internal2 +} // namespace testing + +// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up +// magic needed for implementing UniversalPrinter won't work. +namespace testing_internal { + +// Used to print a value that is not an STL-style container when the +// user doesn't define PrintTo() for it. +template +void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) { + // With the following statement, during unqualified name lookup, + // testing::internal2::operator<< appears as if it was declared in + // the nearest enclosing namespace that contains both + // ::testing_internal and ::testing::internal2, i.e. the global + // namespace. 
For more details, refer to the C++ Standard section + // 7.3.4-1 [namespace.udir]. This allows us to fall back onto + // testing::internal2::operator<< in case T doesn't come with a << + // operator. + // + // We cannot write 'using ::testing::internal2::operator<<;', which + // gcc 3.3 fails to compile due to a compiler bug. + using namespace ::testing::internal2; // NOLINT + + // Assuming T is defined in namespace foo, in the next statement, + // the compiler will consider all of: + // + // 1. foo::operator<< (thanks to Koenig look-up), + // 2. ::operator<< (as the current namespace is enclosed in ::), + // 3. testing::internal2::operator<< (thanks to the using statement above). + // + // The operator<< whose type matches T best will be picked. + // + // We deliberately allow #2 to be a candidate, as sometimes it's + // impossible to define #1 (e.g. when foo is ::std, defining + // anything in it is undefined behavior unless you are a compiler + // vendor.). + *os << value; +} + +} // namespace testing_internal + +namespace testing { +namespace internal { + +// FormatForComparison::Format(value) formats a +// value of type ToPrint that is an operand of a comparison assertion +// (e.g. ASSERT_EQ). OtherOperand is the type of the other operand in +// the comparison, and is used to help determine the best way to +// format the value. In particular, when the value is a C string +// (char pointer) and the other operand is an STL string object, we +// want to format the C string as a string, since we know it is +// compared by value with the string object. If the value is a char +// pointer but the other operand is not an STL string object, we don't +// know whether the pointer is supposed to point to a NUL-terminated +// string, and thus want to print it as a pointer to be safe. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + +// The default case. +template +class FormatForComparison { + public: + static ::std::string Format(const ToPrint& value) { + return ::testing::PrintToString(value); + } +}; + +// Array. +template +class FormatForComparison { + public: + static ::std::string Format(const ToPrint* value) { + return FormatForComparison::Format(value); + } +}; + +// By default, print C string as pointers to be safe, as we don't know +// whether they actually point to a NUL-terminated string. + +#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType) \ + template \ + class FormatForComparison { \ + public: \ + static ::std::string Format(CharType* value) { \ + return ::testing::PrintToString(static_cast(value)); \ + } \ + } + +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t); + +#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_ + +// If a C string is compared with an STL string object, we know it's meant +// to point to a NUL-terminated string, and thus can print it as a string. 
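As a sanity check on the selection rules described at the top of this header (a PrintTo() in the value's own namespace is preferred, then an operator<<, then the fallback printers), here is a minimal sketch; the namespace foo, the type MyPoint, and the test name are illustrative and not part of Google Test.

#include <ostream>
#include <string>
#include "gtest/gtest.h"

namespace foo {
struct MyPoint { int x; int y; };

// Rule 1: a PrintTo() next to the type is found via argument-dependent lookup.
void PrintTo(const MyPoint& p, ::std::ostream* os) {
  *os << "(" << p.x << ", " << p.y << ")";
}
}  // namespace foo

TEST(PrinterSketch, PicksUpPrintTo) {
  const foo::MyPoint p = {1, 2};
  // PrintToString() routes through the same machinery, so the custom
  // formatter above is used instead of the raw-bytes fallback.
  EXPECT_EQ("(1, 2)", ::testing::PrintToString(p));
}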
+ +#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \ + template <> \ + class FormatForComparison { \ + public: \ + static ::std::string Format(CharType* value) { \ + return ::testing::PrintToString(value); \ + } \ + } + +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string); + +#if GTEST_HAS_GLOBAL_STRING +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::string); +#endif + +#if GTEST_HAS_GLOBAL_WSTRING +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::wstring); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::wstring); +#endif + +#if GTEST_HAS_STD_WSTRING +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring); +#endif + +#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_ + +// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, and etc) +// operand to be used in a failure message. The type (but not value) +// of the other operand may affect the format. This allows us to +// print a char* as a raw pointer when it is compared against another +// char* or void*, and print it as a C string when it is compared +// against an std::string object, for example. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +template +std::string FormatForComparisonFailureMessage( + const T1& value, const T2& /* other_operand */) { + return FormatForComparison::Format(value); +} + +// UniversalPrinter::Print(value, ostream_ptr) prints the given +// value to the given ostream. The caller must ensure that +// 'ostream_ptr' is not NULL, or the behavior is undefined. +// +// We define UniversalPrinter as a class template (as opposed to a +// function template), as we need to partially specialize it for +// reference types, which cannot be done with function templates. +template +class UniversalPrinter; + +template +void UniversalPrint(const T& value, ::std::ostream* os); + +// Used to print an STL-style container when the user doesn't define +// a PrintTo() for it. +template +void DefaultPrintTo(IsContainer /* dummy */, + false_type /* is not a pointer */, + const C& container, ::std::ostream* os) { + const size_t kMaxCount = 32; // The maximum number of elements to print. + *os << '{'; + size_t count = 0; + for (typename C::const_iterator it = container.begin(); + it != container.end(); ++it, ++count) { + if (count > 0) { + *os << ','; + if (count == kMaxCount) { // Enough has been printed. + *os << " ..."; + break; + } + } + *os << ' '; + // We cannot call PrintTo(*it, os) here as PrintTo() doesn't + // handle *it being a native array. + internal::UniversalPrint(*it, os); + } + + if (count > 0) { + *os << ' '; + } + *os << '}'; +} + +// Used to print a pointer that is neither a char pointer nor a member +// pointer, when the user doesn't define PrintTo() for it. (A member +// variable pointer or member function pointer doesn't really point to +// a location in the address space. Their representation is +// implementation-defined. Therefore they will be printed as raw +// bytes.) +template +void DefaultPrintTo(IsNotContainer /* dummy */, + true_type /* is a pointer */, + T* p, ::std::ostream* os) { + if (p == NULL) { + *os << "NULL"; + } else { + // C++ doesn't allow casting from a function pointer to any object + // pointer. + // + // IsTrue() silences warnings: "Condition is always true", + // "unreachable code". 
+ if (IsTrue(ImplicitlyConvertible::value)) { + // T is not a function type. We just call << to print p, + // relying on ADL to pick up user-defined << for their pointer + // types, if any. + *os << p; + } else { + // T is a function type, so '*os << p' doesn't do what we want + // (it just prints p as bool). We want to print p as a const + // void*. However, we cannot cast it to const void* directly, + // even using reinterpret_cast, as earlier versions of gcc + // (e.g. 3.4.5) cannot compile the cast when p is a function + // pointer. Casting to UInt64 first solves the problem. + *os << reinterpret_cast( + reinterpret_cast(p)); + } + } +} + +// Used to print a non-container, non-pointer value when the user +// doesn't define PrintTo() for it. +template +void DefaultPrintTo(IsNotContainer /* dummy */, + false_type /* is not a pointer */, + const T& value, ::std::ostream* os) { + ::testing_internal::DefaultPrintNonContainerTo(value, os); +} + +// Prints the given value using the << operator if it has one; +// otherwise prints the bytes in it. This is what +// UniversalPrinter::Print() does when PrintTo() is not specialized +// or overloaded for type T. +// +// A user can override this behavior for a class type Foo by defining +// an overload of PrintTo() in the namespace where Foo is defined. We +// give the user this option as sometimes defining a << operator for +// Foo is not desirable (e.g. the coding style may prevent doing it, +// or there is already a << operator but it doesn't do what the user +// wants). +template +void PrintTo(const T& value, ::std::ostream* os) { + // DefaultPrintTo() is overloaded. The type of its first two + // arguments determine which version will be picked. If T is an + // STL-style container, the version for container will be called; if + // T is a pointer, the pointer version will be called; otherwise the + // generic version will be called. + // + // Note that we check for container types here, prior to we check + // for protocol message types in our operator<<. The rationale is: + // + // For protocol messages, we want to give people a chance to + // override Google Mock's format by defining a PrintTo() or + // operator<<. For STL containers, other formats can be + // incompatible with Google Mock's format for the container + // elements; therefore we check for container types here to ensure + // that our format is used. + // + // The second argument of DefaultPrintTo() is needed to bypass a bug + // in Symbian's C++ compiler that prevents it from picking the right + // overload between: + // + // PrintTo(const T& x, ...); + // PrintTo(T* x, ...); + DefaultPrintTo(IsContainerTest(0), is_pointer(), value, os); +} + +// The following list of PrintTo() overloads tells +// UniversalPrinter::Print() how to print standard types (built-in +// types, strings, plain arrays, and pointers). + +// Overloads for various char types. +GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os); +GTEST_API_ void PrintTo(signed char c, ::std::ostream* os); +inline void PrintTo(char c, ::std::ostream* os) { + // When printing a plain char, we always treat it as unsigned. This + // way, the output won't be affected by whether the compiler thinks + // char is signed or not. + PrintTo(static_cast(c), os); +} + +// Overloads for other simple built-in types. +inline void PrintTo(bool x, ::std::ostream* os) { + *os << (x ? "true" : "false"); +} + +// Overload for wchar_t type. 
+// Prints a wchar_t as a symbol if it is printable or as its internal +// code otherwise and also as its decimal code (except for L'\0'). +// The L'\0' char is printed as "L'\\0'". The decimal code is printed +// as signed integer when wchar_t is implemented by the compiler +// as a signed type and is printed as an unsigned integer when wchar_t +// is implemented as an unsigned type. +GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os); + +// Overloads for C strings. +GTEST_API_ void PrintTo(const char* s, ::std::ostream* os); +inline void PrintTo(char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} + +// signed/unsigned char is often used for representing binary data, so +// we print pointers to it as void* to be safe. +inline void PrintTo(const signed char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(signed char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(const unsigned char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(unsigned char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} + +// MSVC can be configured to define wchar_t as a typedef of unsigned +// short. It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native +// type. When wchar_t is a typedef, defining an overload for const +// wchar_t* would cause unsigned short* be printed as a wide string, +// possibly causing invalid memory accesses. +#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) +// Overloads for wide C strings +GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os); +inline void PrintTo(wchar_t* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +#endif + +// Overload for C arrays. Multi-dimensional arrays are printed +// properly. + +// Prints the given number of elements in an array, without printing +// the curly braces. +template +void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) { + UniversalPrint(a[0], os); + for (size_t i = 1; i != count; i++) { + *os << ", "; + UniversalPrint(a[i], os); + } +} + +// Overloads for ::string and ::std::string. +#if GTEST_HAS_GLOBAL_STRING +GTEST_API_ void PrintStringTo(const ::string&s, ::std::ostream* os); +inline void PrintTo(const ::string& s, ::std::ostream* os) { + PrintStringTo(s, os); +} +#endif // GTEST_HAS_GLOBAL_STRING + +GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os); +inline void PrintTo(const ::std::string& s, ::std::ostream* os) { + PrintStringTo(s, os); +} + +// Overloads for ::wstring and ::std::wstring. +#if GTEST_HAS_GLOBAL_WSTRING +GTEST_API_ void PrintWideStringTo(const ::wstring&s, ::std::ostream* os); +inline void PrintTo(const ::wstring& s, ::std::ostream* os) { + PrintWideStringTo(s, os); +} +#endif // GTEST_HAS_GLOBAL_WSTRING + +#if GTEST_HAS_STD_WSTRING +GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os); +inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) { + PrintWideStringTo(s, os); +} +#endif // GTEST_HAS_STD_WSTRING + +#if GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_ +// Helper function for printing a tuple. T must be instantiated with +// a tuple type. +template +void PrintTupleTo(const T& t, ::std::ostream* os); +#endif // GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_ + +#if GTEST_HAS_TR1_TUPLE +// Overload for ::std::tr1::tuple. Needed for printing function arguments, +// which are packed as tuples. + +// Overloaded PrintTo() for tuples of various arities. 
We support +// tuples of up-to 10 fields. The following implementation works +// regardless of whether tr1::tuple is implemented using the +// non-standard variadic template feature or not. + +inline void PrintTo(const ::std::tr1::tuple<>& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo( + const ::std::tr1::tuple& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} +#endif // GTEST_HAS_TR1_TUPLE + +#if GTEST_HAS_STD_TUPLE_ +template +void PrintTo(const ::std::tuple& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} +#endif // GTEST_HAS_STD_TUPLE_ + +// Overload for std::pair. +template +void PrintTo(const ::std::pair& value, ::std::ostream* os) { + *os << '('; + // We cannot use UniversalPrint(value.first, os) here, as T1 may be + // a reference type. The same for printing value.second. + UniversalPrinter::Print(value.first, os); + *os << ", "; + UniversalPrinter::Print(value.second, os); + *os << ')'; +} + +// Implements printing a non-reference type T by letting the compiler +// pick the right overload of PrintTo() for T. +template +class UniversalPrinter { + public: + // MSVC warns about adding const to a function type, so we want to + // disable the warning. + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180) + + // Note: we deliberately don't call this PrintTo(), as that name + // conflicts with ::testing::internal::PrintTo in the body of the + // function. + static void Print(const T& value, ::std::ostream* os) { + // By default, ::testing::internal::PrintTo() is used for printing + // the value. + // + // Thanks to Koenig look-up, if T is a class and has its own + // PrintTo() function defined in its namespace, that function will + // be visible here. Since it is more specific than the generic ones + // in ::testing::internal, it will be picked by the compiler in the + // following statement - exactly what we want. + PrintTo(value, os); + } + + GTEST_DISABLE_MSC_WARNINGS_POP_() +}; + +// UniversalPrintArray(begin, len, os) prints an array of 'len' +// elements, starting at address 'begin'. +template +void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) { + if (len == 0) { + *os << "{}"; + } else { + *os << "{ "; + const size_t kThreshold = 18; + const size_t kChunkSize = 8; + // If the array has more than kThreshold elements, we'll have to + // omit some details by printing only the first and the last + // kChunkSize elements. + // TODO(wan@google.com): let the user control the threshold using a flag. 
+ if (len <= kThreshold) { + PrintRawArrayTo(begin, len, os); + } else { + PrintRawArrayTo(begin, kChunkSize, os); + *os << ", ..., "; + PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os); + } + *os << " }"; + } +} +// This overload prints a (const) char array compactly. +GTEST_API_ void UniversalPrintArray( + const char* begin, size_t len, ::std::ostream* os); + +// This overload prints a (const) wchar_t array compactly. +GTEST_API_ void UniversalPrintArray( + const wchar_t* begin, size_t len, ::std::ostream* os); + +// Implements printing an array type T[N]. +template +class UniversalPrinter { + public: + // Prints the given array, omitting some elements when there are too + // many. + static void Print(const T (&a)[N], ::std::ostream* os) { + UniversalPrintArray(a, N, os); + } +}; + +// Implements printing a reference type T&. +template +class UniversalPrinter { + public: + // MSVC warns about adding const to a function type, so we want to + // disable the warning. + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180) + + static void Print(const T& value, ::std::ostream* os) { + // Prints the address of the value. We use reinterpret_cast here + // as static_cast doesn't compile when T is a function type. + *os << "@" << reinterpret_cast(&value) << " "; + + // Then prints the value itself. + UniversalPrint(value, os); + } + + GTEST_DISABLE_MSC_WARNINGS_POP_() +}; + +// Prints a value tersely: for a reference type, the referenced value +// (but not the address) is printed; for a (const) char pointer, the +// NUL-terminated string (but not the pointer) is printed. + +template +class UniversalTersePrinter { + public: + static void Print(const T& value, ::std::ostream* os) { + UniversalPrint(value, os); + } +}; +template +class UniversalTersePrinter { + public: + static void Print(const T& value, ::std::ostream* os) { + UniversalPrint(value, os); + } +}; +template +class UniversalTersePrinter { + public: + static void Print(const T (&value)[N], ::std::ostream* os) { + UniversalPrinter::Print(value, os); + } +}; +template <> +class UniversalTersePrinter { + public: + static void Print(const char* str, ::std::ostream* os) { + if (str == NULL) { + *os << "NULL"; + } else { + UniversalPrint(string(str), os); + } + } +}; +template <> +class UniversalTersePrinter { + public: + static void Print(char* str, ::std::ostream* os) { + UniversalTersePrinter::Print(str, os); + } +}; + +#if GTEST_HAS_STD_WSTRING +template <> +class UniversalTersePrinter { + public: + static void Print(const wchar_t* str, ::std::ostream* os) { + if (str == NULL) { + *os << "NULL"; + } else { + UniversalPrint(::std::wstring(str), os); + } + } +}; +#endif + +template <> +class UniversalTersePrinter { + public: + static void Print(wchar_t* str, ::std::ostream* os) { + UniversalTersePrinter::Print(str, os); + } +}; + +template +void UniversalTersePrint(const T& value, ::std::ostream* os) { + UniversalTersePrinter::Print(value, os); +} + +// Prints a value using the type inferred by the compiler. The +// difference between this and UniversalTersePrint() is that for a +// (const) char pointer, this prints both the pointer and the +// NUL-terminated string. +template +void UniversalPrint(const T& value, ::std::ostream* os) { + // A workarond for the bug in VC++ 7.1 that prevents us from instantiating + // UniversalPrinter with T directly. + typedef T T1; + UniversalPrinter::Print(value, os); +} + +typedef ::std::vector Strings; + +// TuplePolicy must provide: +// - tuple_size +// size of tuple TupleT. 
+// - get(const TupleT& t) +// static function extracting element I of tuple TupleT. +// - tuple_element::type +// type of element I of tuple TupleT. +template +struct TuplePolicy; + +#if GTEST_HAS_TR1_TUPLE +template +struct TuplePolicy { + typedef TupleT Tuple; + static const size_t tuple_size = ::std::tr1::tuple_size::value; + + template + struct tuple_element : ::std::tr1::tuple_element {}; + + template + static typename AddReference< + const typename ::std::tr1::tuple_element::type>::type get( + const Tuple& tuple) { + return ::std::tr1::get(tuple); + } +}; +template +const size_t TuplePolicy::tuple_size; +#endif // GTEST_HAS_TR1_TUPLE + +#if GTEST_HAS_STD_TUPLE_ +template +struct TuplePolicy< ::std::tuple > { + typedef ::std::tuple Tuple; + static const size_t tuple_size = ::std::tuple_size::value; + + template + struct tuple_element : ::std::tuple_element {}; + + template + static const typename ::std::tuple_element::type& get( + const Tuple& tuple) { + return ::std::get(tuple); + } +}; +template +const size_t TuplePolicy< ::std::tuple >::tuple_size; +#endif // GTEST_HAS_STD_TUPLE_ + +#if GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_ +// This helper template allows PrintTo() for tuples and +// UniversalTersePrintTupleFieldsToStrings() to be defined by +// induction on the number of tuple fields. The idea is that +// TuplePrefixPrinter::PrintPrefixTo(t, os) prints the first N +// fields in tuple t, and can be defined in terms of +// TuplePrefixPrinter. +// +// The inductive case. +template +struct TuplePrefixPrinter { + // Prints the first N fields of a tuple. + template + static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) { + TuplePrefixPrinter::PrintPrefixTo(t, os); + GTEST_INTENTIONAL_CONST_COND_PUSH_() + if (N > 1) { + GTEST_INTENTIONAL_CONST_COND_POP_() + *os << ", "; + } + UniversalPrinter< + typename TuplePolicy::template tuple_element::type> + ::Print(TuplePolicy::template get(t), os); + } + + // Tersely prints the first N fields of a tuple to a string vector, + // one element for each field. + template + static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) { + TuplePrefixPrinter::TersePrintPrefixToStrings(t, strings); + ::std::stringstream ss; + UniversalTersePrint(TuplePolicy::template get(t), &ss); + strings->push_back(ss.str()); + } +}; + +// Base case. +template <> +struct TuplePrefixPrinter<0> { + template + static void PrintPrefixTo(const Tuple&, ::std::ostream*) {} + + template + static void TersePrintPrefixToStrings(const Tuple&, Strings*) {} +}; + +// Helper function for printing a tuple. +// Tuple must be either std::tr1::tuple or std::tuple type. +template +void PrintTupleTo(const Tuple& t, ::std::ostream* os) { + *os << "("; + TuplePrefixPrinter::tuple_size>::PrintPrefixTo(t, os); + *os << ")"; +} + +// Prints the fields of a tuple tersely to a string vector, one +// element for each field. See the comment before +// UniversalTersePrint() for how we define "tersely". +template +Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) { + Strings result; + TuplePrefixPrinter::tuple_size>:: + TersePrintPrefixToStrings(value, &result); + return result; +} +#endif // GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_ + +} // namespace internal + +template +::std::string PrintToString(const T& value) { + ::std::stringstream ss; + internal::UniversalTersePrinter::Print(value, &ss); + return ss.str(); +} + +} // namespace testing + +// Include any custom printer added by the local installation. 
+// We must include this header at the end to make sure it can use the +// declarations from this file. +#include "gtest/internal/custom/gtest-printers.h" + +#endif // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h new file mode 100644 index 0000000000..f63fa9a1b2 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h @@ -0,0 +1,232 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// Utilities for testing Google Test itself and code that uses Google Test +// (e.g. frameworks built on top of Google Test). + +#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_ +#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_ + +#include "gtest/gtest.h" + +namespace testing { + +// This helper class can be used to mock out Google Test failure reporting +// so that we can test Google Test or code that builds on Google Test. +// +// An object of this class appends a TestPartResult object to the +// TestPartResultArray object given in the constructor whenever a Google Test +// failure is reported. It can either intercept only failures that are +// generated in the same thread that created this object or it can intercept +// all generated failures. The scope of this mock object can be controlled with +// the second argument to the two arguments constructor. +class GTEST_API_ ScopedFakeTestPartResultReporter + : public TestPartResultReporterInterface { + public: + // The two possible mocking modes of this object. + enum InterceptMode { + INTERCEPT_ONLY_CURRENT_THREAD, // Intercepts only thread local failures. + INTERCEPT_ALL_THREADS // Intercepts all failures. + }; + + // The c'tor sets this object as the test part result reporter used + // by Google Test. The 'result' parameter specifies where to report the + // results. This reporter will only catch failures generated in the current + // thread. 
DEPRECATED + explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result); + + // Same as above, but you can choose the interception scope of this object. + ScopedFakeTestPartResultReporter(InterceptMode intercept_mode, + TestPartResultArray* result); + + // The d'tor restores the previous test part result reporter. + virtual ~ScopedFakeTestPartResultReporter(); + + // Appends the TestPartResult object to the TestPartResultArray + // received in the constructor. + // + // This method is from the TestPartResultReporterInterface + // interface. + virtual void ReportTestPartResult(const TestPartResult& result); + private: + void Init(); + + const InterceptMode intercept_mode_; + TestPartResultReporterInterface* old_reporter_; + TestPartResultArray* const result_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter); +}; + +namespace internal { + +// A helper class for implementing EXPECT_FATAL_FAILURE() and +// EXPECT_NONFATAL_FAILURE(). Its destructor verifies that the given +// TestPartResultArray contains exactly one failure that has the given +// type and contains the given substring. If that's not the case, a +// non-fatal failure will be generated. +class GTEST_API_ SingleFailureChecker { + public: + // The constructor remembers the arguments. + SingleFailureChecker(const TestPartResultArray* results, + TestPartResult::Type type, + const string& substr); + ~SingleFailureChecker(); + private: + const TestPartResultArray* const results_; + const TestPartResult::Type type_; + const string substr_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker); +}; + +} // namespace internal + +} // namespace testing + +// A set of macros for testing Google Test assertions or code that's expected +// to generate Google Test fatal failures. It verifies that the given +// statement will cause exactly one fatal Google Test failure with 'substr' +// being part of the failure message. +// +// There are two different versions of this macro. EXPECT_FATAL_FAILURE only +// affects and considers failures generated in the current thread and +// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. +// +// The verification of the assertion is done correctly even when the statement +// throws an exception or aborts the current function. +// +// Known restrictions: +// - 'statement' cannot reference local non-static variables or +// non-static members of the current object. +// - 'statement' cannot return a value. +// - You cannot stream a failure message to this macro. +// +// Note that even though the implementations of the following two +// macros are much alike, we cannot refactor them to use a common +// helper macro, due to some peculiarity in how the preprocessor +// works. The AcceptsMacroThatExpandsToUnprotectedComma test in +// gtest_unittest.cc will fail to compile if we do that. 
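As a concrete illustration of the contract spelled out above (exactly one fatal failure, matched by substring, with the statement unable to touch local non-static state), a self-test might look like this minimal sketch; the test name is hypothetical.

#include "gtest/gtest-spi.h"

TEST(SpiSketch, CatchesExactlyOneFatalFailure) {
  // ASSERT_TRUE(false) raises one fatal failure; the second argument only
  // needs to be a substring of that failure's message. Note the statement
  // uses no local variables, per the restrictions listed above.
  EXPECT_FATAL_FAILURE(ASSERT_TRUE(false) << "expected fatal failure",
                       "expected fatal failure");
}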
+#define EXPECT_FATAL_FAILURE(statement, substr) \ + do { \ + class GTestExpectFatalFailureHelper {\ + public:\ + static void Execute() { statement; }\ + };\ + ::testing::TestPartResultArray gtest_failures;\ + ::testing::internal::SingleFailureChecker gtest_checker(\ + >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ + {\ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ + GTestExpectFatalFailureHelper::Execute();\ + }\ + } while (::testing::internal::AlwaysFalse()) + +#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ + do { \ + class GTestExpectFatalFailureHelper {\ + public:\ + static void Execute() { statement; }\ + };\ + ::testing::TestPartResultArray gtest_failures;\ + ::testing::internal::SingleFailureChecker gtest_checker(\ + >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ + {\ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ALL_THREADS, >est_failures);\ + GTestExpectFatalFailureHelper::Execute();\ + }\ + } while (::testing::internal::AlwaysFalse()) + +// A macro for testing Google Test assertions or code that's expected to +// generate Google Test non-fatal failures. It asserts that the given +// statement will cause exactly one non-fatal Google Test failure with 'substr' +// being part of the failure message. +// +// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only +// affects and considers failures generated in the current thread and +// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. +// +// 'statement' is allowed to reference local variables and members of +// the current object. +// +// The verification of the assertion is done correctly even when the statement +// throws an exception or aborts the current function. +// +// Known restrictions: +// - You cannot stream a failure message to this macro. +// +// Note that even though the implementations of the following two +// macros are much alike, we cannot refactor them to use a common +// helper macro, due to some peculiarity in how the preprocessor +// works. If we do that, the code won't compile when the user gives +// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that +// expands to code containing an unprotected comma. The +// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc +// catches that. +// +// For the same reason, we have to write +// if (::testing::internal::AlwaysTrue()) { statement; } +// instead of +// GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) +// to avoid an MSVC warning on unreachable code. 
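Mirroring the fatal case, a minimal sketch of the non-fatal macro follows; as noted above, here the statement may reference local variables. The test name is hypothetical.

#include "gtest/gtest-spi.h"

TEST(SpiSketch, CatchesExactlyOneNonFatalFailure) {
  const int expected = 1;
  // EXPECT_EQ produces a non-fatal failure; locals are allowed because the
  // statement runs inline rather than inside a generated helper class.
  EXPECT_NONFATAL_FAILURE(EXPECT_EQ(expected, 2) << "off by one",
                          "off by one");
}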
+#define EXPECT_NONFATAL_FAILURE(statement, substr) \ + do {\ + ::testing::TestPartResultArray gtest_failures;\ + ::testing::internal::SingleFailureChecker gtest_checker(\ + >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ + (substr));\ + {\ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ + if (::testing::internal::AlwaysTrue()) { statement; }\ + }\ + } while (::testing::internal::AlwaysFalse()) + +#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ + do {\ + ::testing::TestPartResultArray gtest_failures;\ + ::testing::internal::SingleFailureChecker gtest_checker(\ + >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ + (substr));\ + {\ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ + ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \ + >est_failures);\ + if (::testing::internal::AlwaysTrue()) { statement; }\ + }\ + } while (::testing::internal::AlwaysFalse()) + +#endif // GTEST_INCLUDE_GTEST_GTEST_SPI_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h new file mode 100644 index 0000000000..77eb844839 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h @@ -0,0 +1,179 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: mheule@google.com (Markus Heule) +// + +#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ +#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ + +#include +#include +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-string.h" + +namespace testing { + +// A copyable object representing the result of a test part (i.e. an +// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCESS()). +// +// Don't inherit from TestPartResult as its destructor is not virtual. 
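Outside of Google Test's own internals, TestPartResult objects (declared below) mostly surface in event listeners. A hypothetical tally listener is sketched here; the class and member names are illustrative, not part of this header.

#include "gtest/gtest.h"

// Counts non-fatal failures as they are reported; nonfatally_failed() is one
// of the accessors declared on TestPartResult below.
class FailureTally : public ::testing::EmptyTestEventListener {
 public:
  FailureTally() : nonfatal_count_(0) {}
  virtual void OnTestPartResult(const ::testing::TestPartResult& result) {
    if (result.nonfatally_failed()) ++nonfatal_count_;
  }
  int nonfatal_count() const { return nonfatal_count_; }
 private:
  int nonfatal_count_;
};

// Registration (ownership passes to the listener list):
//   ::testing::UnitTest::GetInstance()->listeners().Append(new FailureTally);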
+class GTEST_API_ TestPartResult { + public: + // The possible outcomes of a test part (i.e. an assertion or an + // explicit SUCCEED(), FAIL(), or ADD_FAILURE()). + enum Type { + kSuccess, // Succeeded. + kNonFatalFailure, // Failed but the test can continue. + kFatalFailure // Failed and the test should be terminated. + }; + + // C'tor. TestPartResult does NOT have a default constructor. + // Always use this constructor (with parameters) to create a + // TestPartResult object. + TestPartResult(Type a_type, + const char* a_file_name, + int a_line_number, + const char* a_message) + : type_(a_type), + file_name_(a_file_name == NULL ? "" : a_file_name), + line_number_(a_line_number), + summary_(ExtractSummary(a_message)), + message_(a_message) { + } + + // Gets the outcome of the test part. + Type type() const { return type_; } + + // Gets the name of the source file where the test part took place, or + // NULL if it's unknown. + const char* file_name() const { + return file_name_.empty() ? NULL : file_name_.c_str(); + } + + // Gets the line in the source file where the test part took place, + // or -1 if it's unknown. + int line_number() const { return line_number_; } + + // Gets the summary of the failure message. + const char* summary() const { return summary_.c_str(); } + + // Gets the message associated with the test part. + const char* message() const { return message_.c_str(); } + + // Returns true iff the test part passed. + bool passed() const { return type_ == kSuccess; } + + // Returns true iff the test part failed. + bool failed() const { return type_ != kSuccess; } + + // Returns true iff the test part non-fatally failed. + bool nonfatally_failed() const { return type_ == kNonFatalFailure; } + + // Returns true iff the test part fatally failed. + bool fatally_failed() const { return type_ == kFatalFailure; } + + private: + Type type_; + + // Gets the summary of the failure message by omitting the stack + // trace in it. + static std::string ExtractSummary(const char* message); + + // The name of the source file where the test part took place, or + // "" if the source file is unknown. + std::string file_name_; + // The line in the source file where the test part took place, or -1 + // if the line number is unknown. + int line_number_; + std::string summary_; // The test failure summary. + std::string message_; // The test failure message. +}; + +// Prints a TestPartResult object. +std::ostream& operator<<(std::ostream& os, const TestPartResult& result); + +// An array of TestPartResult objects. +// +// Don't inherit from TestPartResultArray as its destructor is not +// virtual. +class GTEST_API_ TestPartResultArray { + public: + TestPartResultArray() {} + + // Appends the given TestPartResult to the array. + void Append(const TestPartResult& result); + + // Returns the TestPartResult at the given index (0-based). + const TestPartResult& GetTestPartResult(int index) const; + + // Returns the number of TestPartResult objects in the array. + int size() const; + + private: + std::vector array_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray); +}; + +// This interface knows how to report a test part result. +class TestPartResultReporterInterface { + public: + virtual ~TestPartResultReporterInterface() {} + + virtual void ReportTestPartResult(const TestPartResult& result) = 0; +}; + +namespace internal { + +// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a +// statement generates new fatal failures. 
To do so it registers itself as the +// current test part result reporter. Besides checking if fatal failures were +// reported, it only delegates the reporting to the former result reporter. +// The original result reporter is restored in the destructor. +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +class GTEST_API_ HasNewFatalFailureHelper + : public TestPartResultReporterInterface { + public: + HasNewFatalFailureHelper(); + virtual ~HasNewFatalFailureHelper(); + virtual void ReportTestPartResult(const TestPartResult& result); + bool has_new_fatal_failure() const { return has_new_fatal_failure_; } + private: + bool has_new_fatal_failure_; + TestPartResultReporterInterface* original_reporter_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper); +}; + +} // namespace internal + +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h new file mode 100644 index 0000000000..5f69d5678e --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h @@ -0,0 +1,263 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ + +// This header implements typed tests and type-parameterized tests. + +// Typed (aka type-driven) tests repeat the same test for types in a +// list. You must know which types you want to test with when writing +// typed tests. Here's how you do it: + +#if 0 + +// First, define a fixture class template. It should be parameterized +// by a type. Remember to derive it from testing::Test. +template +class FooTest : public testing::Test { + public: + ... 
+ typedef std::list List; + static T shared_; + T value_; +}; + +// Next, associate a list of types with the test case, which will be +// repeated for each type in the list. The typedef is necessary for +// the macro to parse correctly. +typedef testing::Types MyTypes; +TYPED_TEST_CASE(FooTest, MyTypes); + +// If the type list contains only one type, you can write that type +// directly without Types<...>: +// TYPED_TEST_CASE(FooTest, int); + +// Then, use TYPED_TEST() instead of TEST_F() to define as many typed +// tests for this test case as you want. +TYPED_TEST(FooTest, DoesBlah) { + // Inside a test, refer to TypeParam to get the type parameter. + // Since we are inside a derived class template, C++ requires use to + // visit the members of FooTest via 'this'. + TypeParam n = this->value_; + + // To visit static members of the fixture, add the TestFixture:: + // prefix. + n += TestFixture::shared_; + + // To refer to typedefs in the fixture, add the "typename + // TestFixture::" prefix. + typename TestFixture::List values; + values.push_back(n); + ... +} + +TYPED_TEST(FooTest, HasPropertyA) { ... } + +#endif // 0 + +// Type-parameterized tests are abstract test patterns parameterized +// by a type. Compared with typed tests, type-parameterized tests +// allow you to define the test pattern without knowing what the type +// parameters are. The defined pattern can be instantiated with +// different types any number of times, in any number of translation +// units. +// +// If you are designing an interface or concept, you can define a +// suite of type-parameterized tests to verify properties that any +// valid implementation of the interface/concept should have. Then, +// each implementation can easily instantiate the test suite to verify +// that it conforms to the requirements, without having to write +// similar tests repeatedly. Here's an example: + +#if 0 + +// First, define a fixture class template. It should be parameterized +// by a type. Remember to derive it from testing::Test. +template +class FooTest : public testing::Test { + ... +}; + +// Next, declare that you will define a type-parameterized test case +// (the _P suffix is for "parameterized" or "pattern", whichever you +// prefer): +TYPED_TEST_CASE_P(FooTest); + +// Then, use TYPED_TEST_P() to define as many type-parameterized tests +// for this type-parameterized test case as you want. +TYPED_TEST_P(FooTest, DoesBlah) { + // Inside a test, refer to TypeParam to get the type parameter. + TypeParam n = 0; + ... +} + +TYPED_TEST_P(FooTest, HasPropertyA) { ... } + +// Now the tricky part: you need to register all test patterns before +// you can instantiate them. The first argument of the macro is the +// test case name; the rest are the names of the tests in this test +// case. +REGISTER_TYPED_TEST_CASE_P(FooTest, + DoesBlah, HasPropertyA); + +// Finally, you are free to instantiate the pattern with the types you +// want. If you put the above code in a header file, you can #include +// it in multiple C++ source files and instantiate it multiple times. +// +// To distinguish different instances of the pattern, the first +// argument to the INSTANTIATE_* macro is a prefix that will be added +// to the actual test case name. Remember to pick unique prefixes for +// different instances. 
+typedef testing::Types MyTypes; +INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); + +// If the type list contains only one type, you can write that type +// directly without Types<...>: +// INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int); + +#endif // 0 + +#include "gtest/internal/gtest-port.h" +#include "gtest/internal/gtest-type-util.h" + +// Implements typed tests. + +#if GTEST_HAS_TYPED_TEST + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the name of the typedef for the type parameters of the +// given test case. +# define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_ + +// The 'Types' template argument below must have spaces around it +// since some compilers may choke on '>>' when passing a template +// instance (e.g. Types) +# define TYPED_TEST_CASE(CaseName, Types) \ + typedef ::testing::internal::TypeList< Types >::type \ + GTEST_TYPE_PARAMS_(CaseName) + +# define TYPED_TEST(CaseName, TestName) \ + template \ + class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ + : public CaseName { \ + private: \ + typedef CaseName TestFixture; \ + typedef gtest_TypeParam_ TypeParam; \ + virtual void TestBody(); \ + }; \ + bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::internal::TypeParameterizedTest< \ + CaseName, \ + ::testing::internal::TemplateSel< \ + GTEST_TEST_CLASS_NAME_(CaseName, TestName)>, \ + GTEST_TYPE_PARAMS_(CaseName)>::Register(\ + "", ::testing::internal::CodeLocation(__FILE__, __LINE__), \ + #CaseName, #TestName, 0); \ + template \ + void GTEST_TEST_CLASS_NAME_(CaseName, TestName)::TestBody() + +#endif // GTEST_HAS_TYPED_TEST + +// Implements type-parameterized tests. + +#if GTEST_HAS_TYPED_TEST_P + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the namespace name that the type-parameterized tests for +// the given type-parameterized test case are defined in. The exact +// name of the namespace is subject to change without notice. +# define GTEST_CASE_NAMESPACE_(TestCaseName) \ + gtest_case_##TestCaseName##_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the name of the variable used to remember the names of +// the defined tests in the given test case. +# define GTEST_TYPED_TEST_CASE_P_STATE_(TestCaseName) \ + gtest_typed_test_case_p_state_##TestCaseName##_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY. +// +// Expands to the name of the variable used to remember the names of +// the registered tests in the given test case. +# define GTEST_REGISTERED_TEST_NAMES_(TestCaseName) \ + gtest_registered_test_names_##TestCaseName##_ + +// The variables defined in the type-parameterized test macros are +// static as typically these macros are used in a .h file that can be +// #included in multiple translation units linked together. 
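To complement the elided "..." sketch earlier in this header, here is a compilable example of the plain typed-test macros defined above (TYPED_TEST_CASE / TYPED_TEST); the fixture, type list, and test names are illustrative.

#include "gtest/gtest.h"

template <typename T>
class CounterTest : public ::testing::Test {
 protected:
  CounterTest() : value_(0) {}
  T value_;
};

// The typedef keeps the comma-separated type list parseable by the macro.
typedef ::testing::Types<char, int, long> CounterTypes;
TYPED_TEST_CASE(CounterTest, CounterTypes);

TYPED_TEST(CounterTest, StartsAtZero) {
  // TypeParam is the element of CounterTypes currently being instantiated;
  // fixture members are reached via this-> because the fixture is a
  // dependent base class.
  TypeParam n = this->value_;
  EXPECT_EQ(TypeParam(0), n);
}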
+# define TYPED_TEST_CASE_P(CaseName) \ + static ::testing::internal::TypedTestCasePState \ + GTEST_TYPED_TEST_CASE_P_STATE_(CaseName) + +# define TYPED_TEST_P(CaseName, TestName) \ + namespace GTEST_CASE_NAMESPACE_(CaseName) { \ + template \ + class TestName : public CaseName { \ + private: \ + typedef CaseName TestFixture; \ + typedef gtest_TypeParam_ TypeParam; \ + virtual void TestBody(); \ + }; \ + static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \ + GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\ + __FILE__, __LINE__, #CaseName, #TestName); \ + } \ + template \ + void GTEST_CASE_NAMESPACE_(CaseName)::TestName::TestBody() + +# define REGISTER_TYPED_TEST_CASE_P(CaseName, ...) \ + namespace GTEST_CASE_NAMESPACE_(CaseName) { \ + typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \ + } \ + static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) = \ + GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames(\ + __FILE__, __LINE__, #__VA_ARGS__) + +// The 'Types' template argument below must have spaces around it +// since some compilers may choke on '>>' when passing a template +// instance (e.g. Types) +# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types) \ + bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::internal::TypeParameterizedTestCase::type>::Register(\ + #Prefix, \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), \ + >EST_TYPED_TEST_CASE_P_STATE_(CaseName), \ + #CaseName, GTEST_REGISTERED_TEST_NAMES_(CaseName)) + +#endif // GTEST_HAS_TYPED_TEST_P + +#endif // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h index 581a44e95f..f846c5bd66 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h @@ -55,17442 +55,29 @@ #include #include -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file declares functions and macros used internally by -// Google Test. They are subject to change without notice. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ - -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: wan@google.com (Zhanyong Wan) -// -// Low-level types and utilities for porting Google Test to various -// platforms. They are subject to change without notice. DO NOT USE -// THEM IN USER CODE. -// -// This file is fundamental to Google Test. All other Google Test source -// files are expected to #include this. Therefore, it cannot #include -// any other Google Test header. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ - -// The user can define the following macros in the build script to -// control Google Test's behavior. If the user doesn't define a macro -// in this list, Google Test will define it. -// -// GTEST_HAS_CLONE - Define it to 1/0 to indicate that clone(2) -// is/isn't available. -// GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions -// are enabled. -// GTEST_HAS_GLOBAL_STRING - Define it to 1/0 to indicate that ::string -// is/isn't available (some systems define -// ::string, which is different to std::string). 
-// GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::string -// is/isn't available (some systems define -// ::wstring, which is different to std::wstring). -// GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular -// expressions are/aren't available. -// GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that -// is/isn't available. -// GTEST_HAS_RTTI - Define it to 1/0 to indicate that RTTI is/isn't -// enabled. -// GTEST_HAS_STD_WSTRING - Define it to 1/0 to indicate that -// std::wstring does/doesn't work (Google Test can -// be used where std::wstring is unavailable). -// GTEST_HAS_TR1_TUPLE - Define it to 1/0 to indicate tr1::tuple -// is/isn't available. -// GTEST_HAS_SEH - Define it to 1/0 to indicate whether the -// compiler supports Microsoft's "Structured -// Exception Handling". -// GTEST_HAS_STREAM_REDIRECTION -// - Define it to 1/0 to indicate whether the -// platform supports I/O stream redirection using -// dup() and dup2(). -// GTEST_USE_OWN_TR1_TUPLE - Define it to 1/0 to indicate whether Google -// Test's own tr1 tuple implementation should be -// used. Unused when the user sets -// GTEST_HAS_TR1_TUPLE to 0. -// GTEST_LANG_CXX11 - Define it to 1/0 to indicate that Google Test -// is building in C++11/C++98 mode. -// GTEST_LINKED_AS_SHARED_LIBRARY -// - Define to 1 when compiling tests that use -// Google Test as a shared library (known as -// DLL on Windows). -// GTEST_CREATE_SHARED_LIBRARY -// - Define to 1 when compiling Google Test itself -// as a shared library. - -// This header defines the following utilities: -// -// Macros indicating the current platform (defined to 1 if compiled on -// the given platform; otherwise undefined): -// GTEST_OS_AIX - IBM AIX -// GTEST_OS_CYGWIN - Cygwin -// GTEST_OS_HPUX - HP-UX -// GTEST_OS_LINUX - Linux -// GTEST_OS_LINUX_ANDROID - Google Android -// GTEST_OS_MAC - Mac OS X -// GTEST_OS_IOS - iOS -// GTEST_OS_IOS_SIMULATOR - iOS simulator -// GTEST_OS_NACL - Google Native Client (NaCl) -// GTEST_OS_OPENBSD - OpenBSD -// GTEST_OS_QNX - QNX -// GTEST_OS_SOLARIS - Sun Solaris -// GTEST_OS_SYMBIAN - Symbian -// GTEST_OS_WINDOWS - Windows (Desktop, MinGW, or Mobile) -// GTEST_OS_WINDOWS_DESKTOP - Windows Desktop -// GTEST_OS_WINDOWS_MINGW - MinGW -// GTEST_OS_WINDOWS_MOBILE - Windows Mobile -// GTEST_OS_ZOS - z/OS -// -// Among the platforms, Cygwin, Linux, Max OS X, and Windows have the -// most stable support. Since core members of the Google Test project -// don't have access to other platforms, support for them may be less -// stable. If you notice any problems on your platform, please notify -// googletestframework@googlegroups.com (patches for fixing them are -// even more welcome!). -// -// Note that it is possible that none of the GTEST_OS_* macros are defined. -// -// Macros indicating available Google Test features (defined to 1 if -// the corresponding feature is supported; otherwise undefined): -// GTEST_HAS_COMBINE - the Combine() function (for value-parameterized -// tests) -// GTEST_HAS_DEATH_TEST - death tests -// GTEST_HAS_PARAM_TEST - value-parameterized tests -// GTEST_HAS_TYPED_TEST - typed tests -// GTEST_HAS_TYPED_TEST_P - type-parameterized tests -// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with -// GTEST_HAS_POSIX_RE (see above) which users can -// define themselves. -// GTEST_USES_SIMPLE_RE - our own simple regex is used; -// the above two are mutually exclusive. -// GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ(). 
-// -// Macros for basic C++ coding: -// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning. -// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a -// variable don't have to be used. -// GTEST_DISALLOW_ASSIGN_ - disables operator=. -// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=. -// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used. -// -// Synchronization: -// Mutex, MutexLock, ThreadLocal, GetThreadCount() -// - synchronization primitives. -// GTEST_IS_THREADSAFE - defined to 1 to indicate that the above -// synchronization primitives have real implementations -// and Google Test is thread-safe; or 0 otherwise. -// -// Template meta programming: -// is_pointer - as in TR1; needed on Symbian and IBM XL C/C++ only. -// IteratorTraits - partial implementation of std::iterator_traits, which -// is not available in libCstd when compiled with Sun C++. -// -// Smart pointers: -// scoped_ptr - as in TR2. -// -// Regular expressions: -// RE - a simple regular expression class using the POSIX -// Extended Regular Expression syntax on UNIX-like -// platforms, or a reduced regular exception syntax on -// other platforms, including Windows. -// -// Logging: -// GTEST_LOG_() - logs messages at the specified severity level. -// LogToStderr() - directs all log messages to stderr. -// FlushInfoLog() - flushes informational log messages. -// -// Stdout and stderr capturing: -// CaptureStdout() - starts capturing stdout. -// GetCapturedStdout() - stops capturing stdout and returns the captured -// string. -// CaptureStderr() - starts capturing stderr. -// GetCapturedStderr() - stops capturing stderr and returns the captured -// string. -// -// Integer types: -// TypeWithSize - maps an integer to a int type. -// Int32, UInt32, Int64, UInt64, TimeInMillis -// - integers of known sizes. -// BiggestInt - the biggest signed integer type. -// -// Command-line utilities: -// GTEST_FLAG() - references a flag. -// GTEST_DECLARE_*() - declares a flag. -// GTEST_DEFINE_*() - defines a flag. -// GetInjectableArgvs() - returns the command line as a vector of strings. -// -// Environment variable utilities: -// GetEnv() - gets the value of an environment variable. -// BoolFromGTestEnv() - parses a bool environment variable. -// Int32FromGTestEnv() - parses an Int32 environment variable. -// StringFromGTestEnv() - parses a string environment variable. - -#include // for isspace, etc -#include // for ptrdiff_t -#include -#include -#include -#ifndef _WIN32_WCE -# include -# include -#endif // !_WIN32_WCE - -#if defined __APPLE__ -# include -# include -#endif - -#include // NOLINT -#include // NOLINT -#include // NOLINT - -#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" -#define GTEST_FLAG_PREFIX_ "gtest_" -#define GTEST_FLAG_PREFIX_DASH_ "gtest-" -#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" -#define GTEST_NAME_ "Google Test" -#define GTEST_PROJECT_URL_ "http://code.google.com/p/googletest/" - -// Determines the version of gcc that is used to compile this. -#ifdef __GNUC__ -// 40302 means version 4.3.2. -# define GTEST_GCC_VER_ \ - (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) -#endif // __GNUC__ - -// Determines the platform on which Google Test is compiled. 
-#ifdef __CYGWIN__ -# define GTEST_OS_CYGWIN 1 -#elif defined __SYMBIAN32__ -# define GTEST_OS_SYMBIAN 1 -#elif defined _WIN32 -# define GTEST_OS_WINDOWS 1 -# ifdef _WIN32_WCE -# define GTEST_OS_WINDOWS_MOBILE 1 -# elif defined(__MINGW__) || defined(__MINGW32__) -# define GTEST_OS_WINDOWS_MINGW 1 -# else -# define GTEST_OS_WINDOWS_DESKTOP 1 -# endif // _WIN32_WCE -#elif defined __APPLE__ -# define GTEST_OS_MAC 1 -# if TARGET_OS_IPHONE -# define GTEST_OS_IOS 1 -# if TARGET_IPHONE_SIMULATOR -# define GTEST_OS_IOS_SIMULATOR 1 -# endif -# endif -#elif defined __linux__ -# define GTEST_OS_LINUX 1 -# if defined __ANDROID__ -# define GTEST_OS_LINUX_ANDROID 1 -# endif -#elif defined __MVS__ -# define GTEST_OS_ZOS 1 -#elif defined(__sun) && defined(__SVR4) -# define GTEST_OS_SOLARIS 1 -#elif defined(_AIX) -# define GTEST_OS_AIX 1 -#elif defined(__hpux) -# define GTEST_OS_HPUX 1 -#elif defined __native_client__ -# define GTEST_OS_NACL 1 -#elif defined __OpenBSD__ -# define GTEST_OS_OPENBSD 1 -#elif defined __QNX__ -# define GTEST_OS_QNX 1 -#endif // __CYGWIN__ - -#ifndef GTEST_LANG_CXX11 -// gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when -// -std={c,gnu}++{0x,11} is passed. The C++11 standard specifies a -// value for __cplusplus, and recent versions of clang, gcc, and -// probably other compilers set that too in C++11 mode. -# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L -// Compiling in at least C++11 mode. -# define GTEST_LANG_CXX11 1 -# else -# define GTEST_LANG_CXX11 0 -# endif -#endif - -// Brings in definitions for functions used in the testing::internal::posix -// namespace (read, write, close, chdir, isatty, stat). We do not currently -// use them on Windows Mobile. -#if !GTEST_OS_WINDOWS -// This assumes that non-Windows OSes provide unistd.h. For OSes where this -// is not the case, we need to include headers that provide the functions -// mentioned above. -# include -# include -#elif !GTEST_OS_WINDOWS_MOBILE -# include -# include -#endif - -#if GTEST_OS_LINUX_ANDROID -// Used to define __ANDROID_API__ matching the target NDK API level. -# include // NOLINT -#endif - -// Defines this to true iff Google Test can use POSIX regular expressions. -#ifndef GTEST_HAS_POSIX_RE -# if GTEST_OS_LINUX_ANDROID -// On Android, is only available starting with Gingerbread. -# define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) -# else -# define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS) -# endif -#endif - -#if GTEST_HAS_POSIX_RE - -// On some platforms, needs someone to define size_t, and -// won't compile otherwise. We can #include it here as we already -// included , which is guaranteed to define size_t through -// . -# include // NOLINT - -# define GTEST_USES_POSIX_RE 1 - -#elif GTEST_OS_WINDOWS - -// is not available on Windows. Use our own simple regex -// implementation instead. -# define GTEST_USES_SIMPLE_RE 1 - -#else - -// may not be available on this platform. Use our own -// simple regex implementation instead. -# define GTEST_USES_SIMPLE_RE 1 - -#endif // GTEST_HAS_POSIX_RE - -#ifndef GTEST_HAS_EXCEPTIONS -// The user didn't tell us whether exceptions are enabled, so we need -// to figure it out. -# if defined(_MSC_VER) || defined(__BORLANDC__) -// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS -// macro to enable exceptions, so we'll do the same. -// Assumes that exceptions are enabled by default. 
-# ifndef _HAS_EXCEPTIONS -# define _HAS_EXCEPTIONS 1 -# endif // _HAS_EXCEPTIONS -# define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS -# elif defined(__GNUC__) && __EXCEPTIONS -// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__SUNPRO_CC) -// Sun Pro CC supports exceptions. However, there is no compile-time way of -// detecting whether they are enabled or not. Therefore, we assume that -// they are enabled unless the user tells us otherwise. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__IBMCPP__) && __EXCEPTIONS -// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__HP_aCC) -// Exception handling is in effect by default in HP aCC compiler. It has to -// be turned of by +noeh compiler option if desired. -# define GTEST_HAS_EXCEPTIONS 1 -# else -// For other compilers, we assume exceptions are disabled to be -// conservative. -# define GTEST_HAS_EXCEPTIONS 0 -# endif // defined(_MSC_VER) || defined(__BORLANDC__) -#endif // GTEST_HAS_EXCEPTIONS - -#if !defined(GTEST_HAS_STD_STRING) -// Even though we don't use this macro any longer, we keep it in case -// some clients still depend on it. -# define GTEST_HAS_STD_STRING 1 -#elif !GTEST_HAS_STD_STRING -// The user told us that ::std::string isn't available. -# error "Google Test cannot be used where ::std::string isn't available." -#endif // !defined(GTEST_HAS_STD_STRING) - -#ifndef GTEST_HAS_GLOBAL_STRING -// The user didn't tell us whether ::string is available, so we need -// to figure it out. - -# define GTEST_HAS_GLOBAL_STRING 0 - -#endif // GTEST_HAS_GLOBAL_STRING - -#ifndef GTEST_HAS_STD_WSTRING -// The user didn't tell us whether ::std::wstring is available, so we need -// to figure it out. -// TODO(wan@google.com): uses autoconf to detect whether ::std::wstring -// is available. - -// Cygwin 1.7 and below doesn't support ::std::wstring. -// Solaris' libc++ doesn't support it either. Android has -// no support for it at least as recent as Froyo (2.2). -# define GTEST_HAS_STD_WSTRING \ - (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS)) - -#endif // GTEST_HAS_STD_WSTRING - -#ifndef GTEST_HAS_GLOBAL_WSTRING -// The user didn't tell us whether ::wstring is available, so we need -// to figure it out. -# define GTEST_HAS_GLOBAL_WSTRING \ - (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING) -#endif // GTEST_HAS_GLOBAL_WSTRING - -// Determines whether RTTI is available. -#ifndef GTEST_HAS_RTTI -// The user didn't tell us whether RTTI is enabled, so we need to -// figure it out. - -# ifdef _MSC_VER - -# ifdef _CPPRTTI // MSVC defines this macro iff RTTI is enabled. -# define GTEST_HAS_RTTI 1 -# else -# define GTEST_HAS_RTTI 0 -# endif - -// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled. -# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302) - -# ifdef __GXX_RTTI -// When building against STLport with the Android NDK and with -// -frtti -fno-exceptions, the build fails at link time with undefined -// references to __cxa_bad_typeid. Note sure if STL or toolchain bug, -// so disable RTTI when detected. -# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \ - !defined(__EXCEPTIONS) -# define GTEST_HAS_RTTI 0 -# else -# define GTEST_HAS_RTTI 1 -# endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS -# else -# define GTEST_HAS_RTTI 0 -# endif // __GXX_RTTI - -// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends -// using has_feature instead. 
has_feature(cxx_rtti) is supported since 2.7, the -// first version with C++ support. -# elif defined(__clang__) - -# define GTEST_HAS_RTTI __has_feature(cxx_rtti) - -// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if -// both the typeid and dynamic_cast features are present. -# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) - -# ifdef __RTTI_ALL__ -# define GTEST_HAS_RTTI 1 -# else -# define GTEST_HAS_RTTI 0 -# endif - -# else - -// For all other compilers, we assume RTTI is enabled. -# define GTEST_HAS_RTTI 1 - -# endif // _MSC_VER - -#endif // GTEST_HAS_RTTI - -// It's this header's responsibility to #include when RTTI -// is enabled. -#if GTEST_HAS_RTTI -# include -#endif - -// Determines whether Google Test can use the pthreads library. -#ifndef GTEST_HAS_PTHREAD -// The user didn't tell us explicitly, so we assume pthreads support is -// available on Linux and Mac. -// -// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0 -// to your compiler flags. -# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX \ - || GTEST_OS_QNX) -#endif // GTEST_HAS_PTHREAD - -#if GTEST_HAS_PTHREAD -// gtest-port.h guarantees to #include when GTEST_HAS_PTHREAD is -// true. -# include // NOLINT - -// For timespec and nanosleep, used below. -# include // NOLINT -#endif - -// Determines whether Google Test can use tr1/tuple. You can define -// this macro to 0 to prevent Google Test from using tuple (any -// feature depending on tuple with be disabled in this mode). -#ifndef GTEST_HAS_TR1_TUPLE -# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) -// STLport, provided with the Android NDK, has neither or . -# define GTEST_HAS_TR1_TUPLE 0 -# else -// The user didn't tell us not to do it, so we assume it's OK. -# define GTEST_HAS_TR1_TUPLE 1 -# endif -#endif // GTEST_HAS_TR1_TUPLE - -// Determines whether Google Test's own tr1 tuple implementation -// should be used. -#ifndef GTEST_USE_OWN_TR1_TUPLE -// The user didn't tell us, so we need to figure it out. - -// We use our own TR1 tuple if we aren't sure the user has an -// implementation of it already. At this time, libstdc++ 4.0.0+ and -// MSVC 2010 are the only mainstream standard libraries that come -// with a TR1 tuple implementation. NVIDIA's CUDA NVCC compiler -// pretends to be GCC by defining __GNUC__ and friends, but cannot -// compile GCC's tuple implementation. MSVC 2008 (9.0) provides TR1 -// tuple in a 323 MB Feature Pack download, which we cannot assume the -// user has. QNX's QCC compiler is a modified GCC but it doesn't -// support TR1 tuple. libc++ only provides std::tuple, in C++11 mode, -// and it can be used with some compilers that define __GNUC__. -# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \ - && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) || _MSC_VER >= 1600 -# define GTEST_ENV_HAS_TR1_TUPLE_ 1 -# endif - -// C++11 specifies that provides std::tuple. Use that if gtest is used -// in C++11 mode and libstdc++ isn't very old (binaries targeting OS X 10.6 -// can build with clang but need to use gcc4.2's libstdc++). 
-# if GTEST_LANG_CXX11 && (!defined(__GLIBCXX__) || __GLIBCXX__ > 20110325) -# define GTEST_ENV_HAS_STD_TUPLE_ 1 -# endif - -# if GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_ -# define GTEST_USE_OWN_TR1_TUPLE 0 -# else -# define GTEST_USE_OWN_TR1_TUPLE 1 -# endif - -#endif // GTEST_USE_OWN_TR1_TUPLE - -// To avoid conditional compilation everywhere, we make it -// gtest-port.h's responsibility to #include the header implementing -// tr1/tuple. -#if GTEST_HAS_TR1_TUPLE - -# if GTEST_USE_OWN_TR1_TUPLE -// This file was GENERATED by command: -// pump.py gtest-tuple.h.pump -// DO NOT EDIT BY HAND!!! - -// Copyright 2009 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -// Implements a subset of TR1 tuple needed by Google Test and Google Mock. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ - -#include // For ::std::pair. - -// The compiler used in Symbian has a bug that prevents us from declaring the -// tuple template as a friend (it complains that tuple is redefined). This -// hack bypasses the bug by declaring the members that should otherwise be -// private as public. -// Sun Studio versions < 12 also have the above bug. -#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590) -# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public: -#else -# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \ - template friend class tuple; \ - private: -#endif - -// GTEST_n_TUPLE_(T) is the type of an n-tuple. -#define GTEST_0_TUPLE_(T) tuple<> -#define GTEST_1_TUPLE_(T) tuple -#define GTEST_2_TUPLE_(T) tuple -#define GTEST_3_TUPLE_(T) tuple -#define GTEST_4_TUPLE_(T) tuple -#define GTEST_5_TUPLE_(T) tuple -#define GTEST_6_TUPLE_(T) tuple -#define GTEST_7_TUPLE_(T) tuple -#define GTEST_8_TUPLE_(T) tuple -#define GTEST_9_TUPLE_(T) tuple -#define GTEST_10_TUPLE_(T) tuple - -// GTEST_n_TYPENAMES_(T) declares a list of n typenames. 
-#define GTEST_0_TYPENAMES_(T) -#define GTEST_1_TYPENAMES_(T) typename T##0 -#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1 -#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2 -#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3 -#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4 -#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5 -#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6 -#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6, typename T##7 -#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6, \ - typename T##7, typename T##8 -#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6, \ - typename T##7, typename T##8, typename T##9 - -// In theory, defining stuff in the ::std namespace is undefined -// behavior. We can do this as we are playing the role of a standard -// library vendor. -namespace std { -namespace tr1 { - -template -class tuple; - -// Anything in namespace gtest_internal is Google Test's INTERNAL -// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code. -namespace gtest_internal { - -// ByRef::type is T if T is a reference; otherwise it's const T&. -template -struct ByRef { typedef const T& type; }; // NOLINT -template -struct ByRef { typedef T& type; }; // NOLINT - -// A handy wrapper for ByRef. -#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef::type - -// AddRef::type is T if T is a reference; otherwise it's T&. This -// is the same as tr1::add_reference::type. -template -struct AddRef { typedef T& type; }; // NOLINT -template -struct AddRef { typedef T& type; }; // NOLINT - -// A handy wrapper for AddRef. -#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef::type - -// A helper for implementing get(). -template class Get; - -// A helper for implementing tuple_element. kIndexValid is true -// iff k < the number of fields in tuple type T. 
-template -struct TupleElement; - -template -struct TupleElement { - typedef T0 type; -}; - -template -struct TupleElement { - typedef T1 type; -}; - -template -struct TupleElement { - typedef T2 type; -}; - -template -struct TupleElement { - typedef T3 type; -}; - -template -struct TupleElement { - typedef T4 type; -}; - -template -struct TupleElement { - typedef T5 type; -}; - -template -struct TupleElement { - typedef T6 type; -}; - -template -struct TupleElement { - typedef T7 type; -}; - -template -struct TupleElement { - typedef T8 type; -}; - -template -struct TupleElement { - typedef T9 type; -}; - -} // namespace gtest_internal - -template <> -class tuple<> { - public: - tuple() {} - tuple(const tuple& /* t */) {} - tuple& operator=(const tuple& /* t */) { return *this; } -}; - -template -class GTEST_1_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {} - - tuple(const tuple& t) : f0_(t.f0_) {} - - template - tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_1_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) { - f0_ = t.f0_; - return *this; - } - - T0 f0_; -}; - -template -class GTEST_2_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0), - f1_(f1) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {} - - template - tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {} - template - tuple(const ::std::pair& p) : f0_(p.first), f1_(p.second) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_2_TUPLE_(U)& t) { - return CopyFrom(t); - } - template - tuple& operator=(const ::std::pair& p) { - f0_ = p.first; - f1_ = p.second; - return *this; - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - return *this; - } - - T0 f0_; - T1 f1_; -}; - -template -class GTEST_3_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} - - template - tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_3_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; -}; - -template -class GTEST_4_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2), - f3_(f3) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {} - - template - tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_4_TUPLE_(U)& 
t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; -}; - -template -class GTEST_5_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, - GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_) {} - - template - tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_5_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; -}; - -template -class GTEST_6_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), - f5_(f5) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_) {} - - template - tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_6_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; -}; - -template -class GTEST_7_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2), - f3_(f3), f4_(f4), f5_(f5), f6_(f6) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} - - template - tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_7_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; - T6 f6_; -}; - -template -class GTEST_8_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - 
GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, - GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), - f5_(f5), f6_(f6), f7_(f7) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} - - template - tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_8_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - f7_ = t.f7_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; - T6 f6_; - T7 f7_; -}; - -template -class GTEST_9_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, - GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), - f5_(f5), f6_(f6), f7_(f7), f8_(f8) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} - - template - tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_9_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - f7_ = t.f7_; - f8_ = t.f8_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; - T6 f6_; - T7 f7_; - T8 f8_; -}; - -template -class tuple { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(), - f9_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, - GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2), - f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {} - - template - tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), - f9_(t.f9_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_10_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - f7_ = t.f7_; - f8_ = t.f8_; - f9_ = t.f9_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 
f4_; - T5 f5_; - T6 f6_; - T7 f7_; - T8 f8_; - T9 f9_; -}; - -// 6.1.3.2 Tuple creation functions. - -// Known limitations: we don't support passing an -// std::tr1::reference_wrapper to make_tuple(). And we don't -// implement tie(). - -inline tuple<> make_tuple() { return tuple<>(); } - -template -inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) { - return GTEST_1_TUPLE_(T)(f0); -} - -template -inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) { - return GTEST_2_TUPLE_(T)(f0, f1); -} - -template -inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) { - return GTEST_3_TUPLE_(T)(f0, f1, f2); -} - -template -inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3) { - return GTEST_4_TUPLE_(T)(f0, f1, f2, f3); -} - -template -inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4) { - return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4); -} - -template -inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5) { - return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5); -} - -template -inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6) { - return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6); -} - -template -inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) { - return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7); -} - -template -inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, - const T8& f8) { - return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8); -} - -template -inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, - const T8& f8, const T9& f9) { - return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9); -} - -// 6.1.3.3 Tuple helper classes. - -template struct tuple_size; - -template -struct tuple_size { - static const int value = 0; -}; - -template -struct tuple_size { - static const int value = 1; -}; - -template -struct tuple_size { - static const int value = 2; -}; - -template -struct tuple_size { - static const int value = 3; -}; - -template -struct tuple_size { - static const int value = 4; -}; - -template -struct tuple_size { - static const int value = 5; -}; - -template -struct tuple_size { - static const int value = 6; -}; - -template -struct tuple_size { - static const int value = 7; -}; - -template -struct tuple_size { - static const int value = 8; -}; - -template -struct tuple_size { - static const int value = 9; -}; - -template -struct tuple_size { - static const int value = 10; -}; - -template -struct tuple_element { - typedef typename gtest_internal::TupleElement< - k < (tuple_size::value), k, Tuple>::type type; -}; - -#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element::type - -// 6.1.3.4 Element access. 
- -namespace gtest_internal { - -template <> -class Get<0> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) - Field(Tuple& t) { return t.f0_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) - ConstField(const Tuple& t) { return t.f0_; } -}; - -template <> -class Get<1> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) - Field(Tuple& t) { return t.f1_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) - ConstField(const Tuple& t) { return t.f1_; } -}; - -template <> -class Get<2> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) - Field(Tuple& t) { return t.f2_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) - ConstField(const Tuple& t) { return t.f2_; } -}; - -template <> -class Get<3> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) - Field(Tuple& t) { return t.f3_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) - ConstField(const Tuple& t) { return t.f3_; } -}; - -template <> -class Get<4> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) - Field(Tuple& t) { return t.f4_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) - ConstField(const Tuple& t) { return t.f4_; } -}; - -template <> -class Get<5> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) - Field(Tuple& t) { return t.f5_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) - ConstField(const Tuple& t) { return t.f5_; } -}; - -template <> -class Get<6> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) - Field(Tuple& t) { return t.f6_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) - ConstField(const Tuple& t) { return t.f6_; } -}; - -template <> -class Get<7> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) - Field(Tuple& t) { return t.f7_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) - ConstField(const Tuple& t) { return t.f7_; } -}; - -template <> -class Get<8> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) - Field(Tuple& t) { return t.f8_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) - ConstField(const Tuple& t) { return t.f8_; } -}; - -template <> -class Get<9> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) - Field(Tuple& t) { return t.f9_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) - ConstField(const Tuple& t) { return t.f9_; } -}; - -} // namespace gtest_internal - -template -GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) -get(GTEST_10_TUPLE_(T)& t) { - return gtest_internal::Get::Field(t); -} - -template -GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) -get(const GTEST_10_TUPLE_(T)& t) { - return gtest_internal::Get::ConstField(t); -} - -// 6.1.3.5 Relational operators - -// We only implement == and !=, as we don't have a need for the rest yet. - -namespace gtest_internal { - -// SameSizeTuplePrefixComparator::Eq(t1, t2) returns true if the -// first k fields of t1 equals the first k fields of t2. -// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if -// k1 != k2. 
-template -struct SameSizeTuplePrefixComparator; - -template <> -struct SameSizeTuplePrefixComparator<0, 0> { - template - static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) { - return true; - } -}; - -template -struct SameSizeTuplePrefixComparator { - template - static bool Eq(const Tuple1& t1, const Tuple2& t2) { - return SameSizeTuplePrefixComparator::Eq(t1, t2) && - ::std::tr1::get(t1) == ::std::tr1::get(t2); - } -}; - -} // namespace gtest_internal - -template -inline bool operator==(const GTEST_10_TUPLE_(T)& t, - const GTEST_10_TUPLE_(U)& u) { - return gtest_internal::SameSizeTuplePrefixComparator< - tuple_size::value, - tuple_size::value>::Eq(t, u); -} - -template -inline bool operator!=(const GTEST_10_TUPLE_(T)& t, - const GTEST_10_TUPLE_(U)& u) { return !(t == u); } - -// 6.1.4 Pairs. -// Unimplemented. - -} // namespace tr1 -} // namespace std - -#undef GTEST_0_TUPLE_ -#undef GTEST_1_TUPLE_ -#undef GTEST_2_TUPLE_ -#undef GTEST_3_TUPLE_ -#undef GTEST_4_TUPLE_ -#undef GTEST_5_TUPLE_ -#undef GTEST_6_TUPLE_ -#undef GTEST_7_TUPLE_ -#undef GTEST_8_TUPLE_ -#undef GTEST_9_TUPLE_ -#undef GTEST_10_TUPLE_ - -#undef GTEST_0_TYPENAMES_ -#undef GTEST_1_TYPENAMES_ -#undef GTEST_2_TYPENAMES_ -#undef GTEST_3_TYPENAMES_ -#undef GTEST_4_TYPENAMES_ -#undef GTEST_5_TYPENAMES_ -#undef GTEST_6_TYPENAMES_ -#undef GTEST_7_TYPENAMES_ -#undef GTEST_8_TYPENAMES_ -#undef GTEST_9_TYPENAMES_ -#undef GTEST_10_TYPENAMES_ - -#undef GTEST_DECLARE_TUPLE_AS_FRIEND_ -#undef GTEST_BY_REF_ -#undef GTEST_ADD_REF_ -#undef GTEST_TUPLE_ELEMENT_ - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ -# elif GTEST_ENV_HAS_STD_TUPLE_ -# include -// C++11 puts its tuple into the ::std namespace rather than -// ::std::tr1. gtest expects tuple to live in ::std::tr1, so put it there. -// This causes undefined behavior, but supported compilers react in -// the way we intend. -namespace std { -namespace tr1 { -using ::std::get; -using ::std::make_tuple; -using ::std::tuple; -using ::std::tuple_element; -using ::std::tuple_size; -} -} - -# elif GTEST_OS_SYMBIAN - -// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to -// use STLport's tuple implementation, which unfortunately doesn't -// work as the copy of STLport distributed with Symbian is incomplete. -// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to -// use its own tuple implementation. -# ifdef BOOST_HAS_TR1_TUPLE -# undef BOOST_HAS_TR1_TUPLE -# endif // BOOST_HAS_TR1_TUPLE - -// This prevents , which defines -// BOOST_HAS_TR1_TUPLE, from being #included by Boost's . -# define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED -# include - -# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000) -// GCC 4.0+ implements tr1/tuple in the header. This does -// not conform to the TR1 spec, which requires the header to be . - -# if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 -// Until version 4.3.2, gcc has a bug that causes , -// which is #included by , to not compile when RTTI is -// disabled. _TR1_FUNCTIONAL is the header guard for -// . Hence the following #define is a hack to prevent -// from being included. -# define _TR1_FUNCTIONAL 1 -# include -# undef _TR1_FUNCTIONAL // Allows the user to #include - // if he chooses to. -# else -# include // NOLINT -# endif // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 - -# else -// If the compiler is not GCC 4.0+, we assume the user is using a -// spec-conforming TR1 implementation. 
-# include // NOLINT -# endif // GTEST_USE_OWN_TR1_TUPLE - -#endif // GTEST_HAS_TR1_TUPLE - -// Determines whether clone(2) is supported. -// Usually it will only be available on Linux, excluding -// Linux on the Itanium architecture. -// Also see http://linux.die.net/man/2/clone. -#ifndef GTEST_HAS_CLONE -// The user didn't tell us, so we need to figure it out. - -# if GTEST_OS_LINUX && !defined(__ia64__) -# if GTEST_OS_LINUX_ANDROID -// On Android, clone() is only available on ARM starting with Gingerbread. -# if defined(__arm__) && __ANDROID_API__ >= 9 -# define GTEST_HAS_CLONE 1 -# else -# define GTEST_HAS_CLONE 0 -# endif -# else -# define GTEST_HAS_CLONE 1 -# endif -# else -# define GTEST_HAS_CLONE 0 -# endif // GTEST_OS_LINUX && !defined(__ia64__) - -#endif // GTEST_HAS_CLONE - -// Determines whether to support stream redirection. This is used to test -// output correctness and to implement death tests. -#ifndef GTEST_HAS_STREAM_REDIRECTION -// By default, we assume that stream redirection is supported on all -// platforms except known mobile ones. -# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN -# define GTEST_HAS_STREAM_REDIRECTION 0 -# else -# define GTEST_HAS_STREAM_REDIRECTION 1 -# endif // !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_SYMBIAN -#endif // GTEST_HAS_STREAM_REDIRECTION - -// Determines whether to support death tests. -// Google Test does not support death tests for VC 7.1 and earlier as -// abort() in a VC 7.1 application compiled as GUI in debug config -// pops up a dialog window that cannot be suppressed programmatically. -#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ - (GTEST_OS_MAC && !GTEST_OS_IOS) || GTEST_OS_IOS_SIMULATOR || \ - (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \ - GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \ - GTEST_OS_OPENBSD || GTEST_OS_QNX) -# define GTEST_HAS_DEATH_TEST 1 -# include // NOLINT -#endif - -// We don't support MSVC 7.1 with exceptions disabled now. Therefore -// all the compilers we care about are adequate for supporting -// value-parameterized tests. -#define GTEST_HAS_PARAM_TEST 1 - -// Determines whether to support type-driven tests. - -// Typed tests need and variadic macros, which GCC, VC++ 8.0, -// Sun Pro CC, IBM Visual Age, and HP aCC support. -#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \ - defined(__IBMCPP__) || defined(__HP_aCC) -# define GTEST_HAS_TYPED_TEST 1 -# define GTEST_HAS_TYPED_TEST_P 1 -#endif - -// Determines whether to support Combine(). This only makes sense when -// value-parameterized tests are enabled. The implementation doesn't -// work on Sun Studio since it doesn't understand templated conversion -// operators. -#if GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !defined(__SUNPRO_CC) -# define GTEST_HAS_COMBINE 1 -#endif - -// Determines whether the system compiler uses UTF-16 for encoding wide strings. -#define GTEST_WIDE_STRING_USES_UTF16_ \ - (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX) - -// Determines whether test results can be streamed to a socket. -#if GTEST_OS_LINUX -# define GTEST_CAN_STREAM_RESULTS_ 1 -#endif - -// Defines some utility macros. - -// The GNU compiler emits a warning if nested "if" statements are followed by -// an "else" statement and braces are not used to explicitly disambiguate the -// "else" binding. This leads to problems with code like: -// -// if (gate) -// ASSERT_*(condition) << "Some message"; -// -// The "switch (0) case 0:" idiom is used to suppress this. 
-#ifdef __INTEL_COMPILER -# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ -#else -# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default: // NOLINT -#endif - -// Use this annotation at the end of a struct/class definition to -// prevent the compiler from optimizing away instances that are never -// used. This is useful when all interesting logic happens inside the -// c'tor and / or d'tor. Example: -// -// struct Foo { -// Foo() { ... } -// } GTEST_ATTRIBUTE_UNUSED_; -// -// Also use it after a variable or parameter declaration to tell the -// compiler the variable/parameter does not have to be used. -#if defined(__GNUC__) && !defined(COMPILER_ICC) -# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused)) -#else -# define GTEST_ATTRIBUTE_UNUSED_ -#endif - -// A macro to disallow operator= -// This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_ASSIGN_(type)\ - void operator=(type const &) - -// A macro to disallow copy constructor and operator= -// This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type)\ - type(type const &);\ - GTEST_DISALLOW_ASSIGN_(type) - -// Tell the compiler to warn about unused return values for functions declared -// with this macro. The macro should be used on function declarations -// following the argument list: -// -// Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_; -#if defined(__GNUC__) && (GTEST_GCC_VER_ >= 30400) && !defined(COMPILER_ICC) -# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result)) -#else -# define GTEST_MUST_USE_RESULT_ -#endif // __GNUC__ && (GTEST_GCC_VER_ >= 30400) && !COMPILER_ICC - -// Determine whether the compiler supports Microsoft's Structured Exception -// Handling. This is supported by several Windows compilers but generally -// does not exist on any other system. -#ifndef GTEST_HAS_SEH -// The user didn't tell us, so we need to figure it out. - -# if defined(_MSC_VER) || defined(__BORLANDC__) -// These two compilers are known to support SEH. -# define GTEST_HAS_SEH 1 -# else -// Assume no SEH. -# define GTEST_HAS_SEH 0 -# endif - -#endif // GTEST_HAS_SEH - -#ifdef _MSC_VER - -# if GTEST_LINKED_AS_SHARED_LIBRARY -# define GTEST_API_ __declspec(dllimport) -# elif GTEST_CREATE_SHARED_LIBRARY -# define GTEST_API_ __declspec(dllexport) -# endif - -#endif // _MSC_VER - -#ifndef GTEST_API_ -# define GTEST_API_ -#endif - -#ifdef __GNUC__ -// Ask the compiler to never inline a given function. -# define GTEST_NO_INLINE_ __attribute__((noinline)) -#else -# define GTEST_NO_INLINE_ -#endif - -// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project. -#if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION) -# define GTEST_HAS_CXXABI_H_ 1 -#else -# define GTEST_HAS_CXXABI_H_ 0 -#endif - -namespace testing { - -class Message; - -namespace internal { - -// A secret type that Google Test users don't know about. It has no -// definition on purpose. Therefore it's impossible to create a -// Secret object, which is what we want. -class Secret; - -// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile time -// expression is true. For example, you could use it to verify the -// size of a static array: -// -// GTEST_COMPILE_ASSERT_(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES, -// content_type_names_incorrect_size); -// -// or to make sure a struct is smaller than a certain size: -// -// GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large); -// -// The second argument to the macro is the name of the variable. 
If -// the expression is false, most compilers will issue a warning/error -// containing the name of the variable. - -template -struct CompileAssert { -}; - -#define GTEST_COMPILE_ASSERT_(expr, msg) \ - typedef ::testing::internal::CompileAssert<(static_cast(expr))> \ - msg[static_cast(expr) ? 1 : -1] GTEST_ATTRIBUTE_UNUSED_ - -// Implementation details of GTEST_COMPILE_ASSERT_: -// -// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1 -// elements (and thus is invalid) when the expression is false. -// -// - The simpler definition -// -// #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1] -// -// does not work, as gcc supports variable-length arrays whose sizes -// are determined at run-time (this is gcc's extension and not part -// of the C++ standard). As a result, gcc fails to reject the -// following code with the simple definition: -// -// int foo; -// GTEST_COMPILE_ASSERT_(foo, msg); // not supposed to compile as foo is -// // not a compile-time constant. -// -// - By using the type CompileAssert<(bool(expr))>, we ensures that -// expr is a compile-time constant. (Template arguments must be -// determined at compile-time.) -// -// - The outter parentheses in CompileAssert<(bool(expr))> are necessary -// to work around a bug in gcc 3.4.4 and 4.0.1. If we had written -// -// CompileAssert -// -// instead, these compilers will refuse to compile -// -// GTEST_COMPILE_ASSERT_(5 > 0, some_message); -// -// (They seem to think the ">" in "5 > 0" marks the end of the -// template argument list.) -// -// - The array size is (bool(expr) ? 1 : -1), instead of simply -// -// ((expr) ? 1 : -1). -// -// This is to avoid running into a bug in MS VC 7.1, which -// causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1. - -// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h. -// -// This template is declared, but intentionally undefined. -template -struct StaticAssertTypeEqHelper; - -template -struct StaticAssertTypeEqHelper {}; - -#if GTEST_HAS_GLOBAL_STRING -typedef ::string string; -#else -typedef ::std::string string; -#endif // GTEST_HAS_GLOBAL_STRING - -#if GTEST_HAS_GLOBAL_WSTRING -typedef ::wstring wstring; -#elif GTEST_HAS_STD_WSTRING -typedef ::std::wstring wstring; -#endif // GTEST_HAS_GLOBAL_WSTRING - -// A helper for suppressing warnings on constant condition. It just -// returns 'condition'. -GTEST_API_ bool IsTrue(bool condition); - -// Defines scoped_ptr. - -// This implementation of scoped_ptr is PARTIAL - it only contains -// enough stuff to satisfy Google Test's need. -template -class scoped_ptr { - public: - typedef T element_type; - - explicit scoped_ptr(T* p = NULL) : ptr_(p) {} - ~scoped_ptr() { reset(); } - - T& operator*() const { return *ptr_; } - T* operator->() const { return ptr_; } - T* get() const { return ptr_; } - - T* release() { - T* const ptr = ptr_; - ptr_ = NULL; - return ptr; - } - - void reset(T* p = NULL) { - if (p != ptr_) { - if (IsTrue(sizeof(T) > 0)) { // Makes sure T is a complete type. - delete ptr_; - } - ptr_ = p; - } - } - - private: - T* ptr_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr); -}; - -// Defines RE. - -// A simple C++ wrapper for . It uses the POSIX Extended -// Regular Expression syntax. -class GTEST_API_ RE { - public: - // A copy constructor is required by the Standard to initialize object - // references from r-values. - RE(const RE& other) { Init(other.pattern()); } - - // Constructs an RE from a string. 
- RE(const ::std::string& regex) { Init(regex.c_str()); } // NOLINT - -#if GTEST_HAS_GLOBAL_STRING - - RE(const ::string& regex) { Init(regex.c_str()); } // NOLINT - -#endif // GTEST_HAS_GLOBAL_STRING - - RE(const char* regex) { Init(regex); } // NOLINT - ~RE(); - - // Returns the string representation of the regex. - const char* pattern() const { return pattern_; } - - // FullMatch(str, re) returns true iff regular expression re matches - // the entire str. - // PartialMatch(str, re) returns true iff regular expression re - // matches a substring of str (including str itself). - // - // TODO(wan@google.com): make FullMatch() and PartialMatch() work - // when str contains NUL characters. - static bool FullMatch(const ::std::string& str, const RE& re) { - return FullMatch(str.c_str(), re); - } - static bool PartialMatch(const ::std::string& str, const RE& re) { - return PartialMatch(str.c_str(), re); - } - -#if GTEST_HAS_GLOBAL_STRING - - static bool FullMatch(const ::string& str, const RE& re) { - return FullMatch(str.c_str(), re); - } - static bool PartialMatch(const ::string& str, const RE& re) { - return PartialMatch(str.c_str(), re); - } - -#endif // GTEST_HAS_GLOBAL_STRING - - static bool FullMatch(const char* str, const RE& re); - static bool PartialMatch(const char* str, const RE& re); - - private: - void Init(const char* regex); - - // We use a const char* instead of an std::string, as Google Test used to be - // used where std::string is not available. TODO(wan@google.com): change to - // std::string. - const char* pattern_; - bool is_valid_; - -#if GTEST_USES_POSIX_RE - - regex_t full_regex_; // For FullMatch(). - regex_t partial_regex_; // For PartialMatch(). - -#else // GTEST_USES_SIMPLE_RE - - const char* full_pattern_; // For FullMatch(); - -#endif - - GTEST_DISALLOW_ASSIGN_(RE); -}; - -// Formats a source file path and a line number as they would appear -// in an error message from the compiler used to compile this code. -GTEST_API_ ::std::string FormatFileLocation(const char* file, int line); - -// Formats a file location for compiler-independent XML output. -// Although this function is not platform dependent, we put it next to -// FormatFileLocation in order to contrast the two functions. -GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file, - int line); - -// Defines logging utilities: -// GTEST_LOG_(severity) - logs messages at the specified severity level. The -// message itself is streamed into the macro. -// LogToStderr() - directs all log messages to stderr. -// FlushInfoLog() - flushes informational log messages. - -enum GTestLogSeverity { - GTEST_INFO, - GTEST_WARNING, - GTEST_ERROR, - GTEST_FATAL -}; - -// Formats log entry severity, provides a stream object for streaming the -// log message, and terminates the message with a newline when going out of -// scope. -class GTEST_API_ GTestLog { - public: - GTestLog(GTestLogSeverity severity, const char* file, int line); - - // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. - ~GTestLog(); - - ::std::ostream& GetStream() { return ::std::cerr; } - - private: - const GTestLogSeverity severity_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog); -}; - -#define GTEST_LOG_(severity) \ - ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \ - __FILE__, __LINE__).GetStream() - -inline void LogToStderr() {} -inline void FlushInfoLog() { fflush(NULL); } - -// INTERNAL IMPLEMENTATION - DO NOT USE. -// -// GTEST_CHECK_ is an all-mode assert. 
It aborts the program if the condition -// is not satisfied. -// Synopsys: -// GTEST_CHECK_(boolean_condition); -// or -// GTEST_CHECK_(boolean_condition) << "Additional message"; -// -// This checks the condition and if the condition is not satisfied -// it prints message about the condition violation, including the -// condition itself, plus additional message streamed into it, if any, -// and then it aborts the program. It aborts the program irrespective of -// whether it is built in the debug mode or not. -#define GTEST_CHECK_(condition) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::IsTrue(condition)) \ - ; \ - else \ - GTEST_LOG_(FATAL) << "Condition " #condition " failed. " - -// An all-mode assert to verify that the given POSIX-style function -// call returns 0 (indicating success). Known limitation: this -// doesn't expand to a balanced 'if' statement, so enclose the macro -// in {} if you need to use it as the only statement in an 'if' -// branch. -#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \ - if (const int gtest_error = (posix_call)) \ - GTEST_LOG_(FATAL) << #posix_call << "failed with error " \ - << gtest_error - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Use ImplicitCast_ as a safe version of static_cast for upcasting in -// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a -// const Foo*). When you use ImplicitCast_, the compiler checks that -// the cast is safe. Such explicit ImplicitCast_s are necessary in -// surprisingly many situations where C++ demands an exact type match -// instead of an argument type convertable to a target type. -// -// The syntax for using ImplicitCast_ is the same as for static_cast: -// -// ImplicitCast_(expr) -// -// ImplicitCast_ would have been part of the C++ standard library, -// but the proposal was submitted too late. It will probably make -// its way into the language in the future. -// -// This relatively ugly name is intentional. It prevents clashes with -// similar functions users may have (e.g., implicit_cast). The internal -// namespace alone is not enough because the function can be found by ADL. -template -inline To ImplicitCast_(To x) { return x; } - -// When you upcast (that is, cast a pointer from type Foo to type -// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts -// always succeed. When you downcast (that is, cast a pointer from -// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because -// how do you know the pointer is really of type SubclassOfFoo? It -// could be a bare Foo, or of type DifferentSubclassOfFoo. Thus, -// when you downcast, you should use this macro. In debug mode, we -// use dynamic_cast<> to double-check the downcast is legal (we die -// if it's not). In normal mode, we do the efficient static_cast<> -// instead. Thus, it's important to test in debug mode to make sure -// the cast is legal! -// This is the only place in the code we should use dynamic_cast<>. -// In particular, you SHOULDN'T be using dynamic_cast<> in order to -// do RTTI (eg code like this: -// if (dynamic_cast(foo)) HandleASubclass1Object(foo); -// if (dynamic_cast(foo)) HandleASubclass2Object(foo); -// You should design the code some other way not to need this. -// -// This relatively ugly name is intentional. It prevents clashes with -// similar functions users may have (e.g., down_cast). The internal -// namespace alone is not enough because the function can be found by ADL. 
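// Editor's sketch (illustrative, not gtest's code): the "if/else + stream" idiom
// GTEST_CHECK_ relies on above. The macro expands to a statement whose else-branch
// is a stream expression, so callers can append "<< extra text", and the abort runs
// only when the condition is false. gtest additionally prefixes an ambiguous-else
// blocker, which this sketch omits for brevity.
#include <cstdlib>
#include <iostream>

struct FatalStreamSketch {
  std::ostream& Get() { return std::cerr; }
  ~FatalStreamSketch() { std::cerr << std::endl; std::abort(); }  // runs after streaming
};

#define CHECK_SKETCH(condition)  \
  if (condition)                 \
    ;                            \
  else                           \
    FatalStreamSketch().Get() << "Condition " #condition " failed. "

// Usage: CHECK_SKETCH(fp != NULL) << "could not open the input file";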
-template // use like this: DownCast_(foo); -inline To DownCast_(From* f) { // so we only accept pointers - // Ensures that To is a sub-type of From *. This test is here only - // for compile-time type checking, and has no overhead in an - // optimized build at run-time, as it will be optimized away - // completely. - if (false) { - const To to = NULL; - ::testing::internal::ImplicitCast_(to); - } - -#if GTEST_HAS_RTTI - // RTTI: debug mode only! - GTEST_CHECK_(f == NULL || dynamic_cast(f) != NULL); -#endif - return static_cast(f); -} - -// Downcasts the pointer of type Base to Derived. -// Derived must be a subclass of Base. The parameter MUST -// point to a class of type Derived, not any subclass of it. -// When RTTI is available, the function performs a runtime -// check to enforce this. -template -Derived* CheckedDowncastToActualType(Base* base) { -#if GTEST_HAS_RTTI - GTEST_CHECK_(typeid(*base) == typeid(Derived)); - return dynamic_cast(base); // NOLINT -#else - return static_cast(base); // Poor man's downcast. -#endif -} - -#if GTEST_HAS_STREAM_REDIRECTION - -// Defines the stderr capturer: -// CaptureStdout - starts capturing stdout. -// GetCapturedStdout - stops capturing stdout and returns the captured string. -// CaptureStderr - starts capturing stderr. -// GetCapturedStderr - stops capturing stderr and returns the captured string. -// -GTEST_API_ void CaptureStdout(); -GTEST_API_ std::string GetCapturedStdout(); -GTEST_API_ void CaptureStderr(); -GTEST_API_ std::string GetCapturedStderr(); - -#endif // GTEST_HAS_STREAM_REDIRECTION - - -#if GTEST_HAS_DEATH_TEST - -const ::std::vector& GetInjectableArgvs(); -void SetInjectableArgvs(const ::std::vector* - new_argvs); - -// A copy of all command line arguments. Set by InitGoogleTest(). -extern ::std::vector g_argvs; - -#endif // GTEST_HAS_DEATH_TEST - -// Defines synchronization primitives. - -#if GTEST_HAS_PTHREAD - -// Sleeps for (roughly) n milli-seconds. This function is only for -// testing Google Test's own constructs. Don't use it in user tests, -// either directly or indirectly. -inline void SleepMilliseconds(int n) { - const timespec time = { - 0, // 0 seconds. - n * 1000L * 1000L, // And n ms. - }; - nanosleep(&time, NULL); -} - -// Allows a controller thread to pause execution of newly created -// threads until notified. Instances of this class must be created -// and destroyed in the controller thread. -// -// This class is only for testing Google Test's own constructs. Do not -// use it in user tests, either directly or indirectly. -class Notification { - public: - Notification() : notified_(false) { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL)); - } - ~Notification() { - pthread_mutex_destroy(&mutex_); - } - - // Notifies all threads created with this notification to start. Must - // be called from the controller thread. - void Notify() { - pthread_mutex_lock(&mutex_); - notified_ = true; - pthread_mutex_unlock(&mutex_); - } - - // Blocks until the controller thread notifies. Must be called from a test - // thread. - void WaitForNotification() { - for (;;) { - pthread_mutex_lock(&mutex_); - const bool notified = notified_; - pthread_mutex_unlock(&mutex_); - if (notified) - break; - SleepMilliseconds(10); - } - } - - private: - pthread_mutex_t mutex_; - bool notified_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); -}; - -// As a C-function, ThreadFuncWithCLinkage cannot be templated itself. 
-// Consequently, it cannot select a correct instantiation of ThreadWithParam -// in order to call its Run(). Introducing ThreadWithParamBase as a -// non-templated base class for ThreadWithParam allows us to bypass this -// problem. -class ThreadWithParamBase { - public: - virtual ~ThreadWithParamBase() {} - virtual void Run() = 0; -}; - -// pthread_create() accepts a pointer to a function type with the C linkage. -// According to the Standard (7.5/1), function types with different linkages -// are different even if they are otherwise identical. Some compilers (for -// example, SunStudio) treat them as different types. Since class methods -// cannot be defined with C-linkage we need to define a free C-function to -// pass into pthread_create(). -extern "C" inline void* ThreadFuncWithCLinkage(void* thread) { - static_cast(thread)->Run(); - return NULL; -} - -// Helper class for testing Google Test's multi-threading constructs. -// To use it, write: -// -// void ThreadFunc(int param) { /* Do things with param */ } -// Notification thread_can_start; -// ... -// // The thread_can_start parameter is optional; you can supply NULL. -// ThreadWithParam thread(&ThreadFunc, 5, &thread_can_start); -// thread_can_start.Notify(); -// -// These classes are only for testing Google Test's own constructs. Do -// not use them in user tests, either directly or indirectly. -template -class ThreadWithParam : public ThreadWithParamBase { - public: - typedef void (*UserThreadFunc)(T); - - ThreadWithParam( - UserThreadFunc func, T param, Notification* thread_can_start) - : func_(func), - param_(param), - thread_can_start_(thread_can_start), - finished_(false) { - ThreadWithParamBase* const base = this; - // The thread can be created only after all fields except thread_ - // have been initialized. - GTEST_CHECK_POSIX_SUCCESS_( - pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base)); - } - ~ThreadWithParam() { Join(); } - - void Join() { - if (!finished_) { - GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0)); - finished_ = true; - } - } - - virtual void Run() { - if (thread_can_start_ != NULL) - thread_can_start_->WaitForNotification(); - func_(param_); - } - - private: - const UserThreadFunc func_; // User-supplied thread function. - const T param_; // User-supplied parameter to the thread function. - // When non-NULL, used to block execution until the controller thread - // notifies. - Notification* const thread_can_start_; - bool finished_; // true iff we know that the thread function has finished. - pthread_t thread_; // The native thread object. - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); -}; - -// MutexBase and Mutex implement mutex on pthreads-based platforms. They -// are used in conjunction with class MutexLock: -// -// Mutex mutex; -// ... -// MutexLock lock(&mutex); // Acquires the mutex and releases it at the end -// // of the current scope. -// -// MutexBase implements behavior for both statically and dynamically -// allocated mutexes. Do not use MutexBase directly. Instead, write -// the following to define a static mutex: -// -// GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex); -// -// You can forward declare a static mutex like this: -// -// GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex); -// -// To create a dynamic mutex, just define an object of type Mutex. -class MutexBase { - public: - // Acquires this mutex. - void Lock() { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_)); - owner_ = pthread_self(); - has_owner_ = true; - } - - // Releases this mutex. 
- void Unlock() { - // Since the lock is being released the owner_ field should no longer be - // considered valid. We don't protect writing to has_owner_ here, as it's - // the caller's responsibility to ensure that the current thread holds the - // mutex when this is called. - has_owner_ = false; - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_)); - } - - // Does nothing if the current thread holds the mutex. Otherwise, crashes - // with high probability. - void AssertHeld() const { - GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self())) - << "The current thread is not holding the mutex @" << this; - } - - // A static mutex may be used before main() is entered. It may even - // be used before the dynamic initialization stage. Therefore we - // must be able to initialize a static mutex object at link time. - // This means MutexBase has to be a POD and its member variables - // have to be public. - public: - pthread_mutex_t mutex_; // The underlying pthread mutex. - // has_owner_ indicates whether the owner_ field below contains a valid thread - // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All - // accesses to the owner_ field should be protected by a check of this field. - // An alternative might be to memset() owner_ to all zeros, but there's no - // guarantee that a zero'd pthread_t is necessarily invalid or even different - // from pthread_self(). - bool has_owner_; - pthread_t owner_; // The thread holding the mutex. -}; - -// Forward-declares a static mutex. -# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ - extern ::testing::internal::MutexBase mutex - -// Defines and statically (i.e. at link time) initializes a static mutex. -// The initialization list here does not explicitly initialize each field, -// instead relying on default initialization for the unspecified fields. In -// particular, the owner_ field (a pthread_t) is not explicitly initialized. -// This allows initialization to work whether pthread_t is a scalar or struct. -// The flag -Wmissing-field-initializers must not be specified for this to work. -# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ - ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false } - -// The Mutex class can only be used for mutexes created at runtime. It -// shares its API with MutexBase otherwise. -class Mutex : public MutexBase { - public: - Mutex() { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL)); - has_owner_ = false; - } - ~Mutex() { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); - } - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); -}; - -// We cannot name this class MutexLock as the ctor declaration would -// conflict with a macro named MutexLock, which is defined on some -// platforms. Hence the typedef trick below. -class GTestMutexLock { - public: - explicit GTestMutexLock(MutexBase* mutex) - : mutex_(mutex) { mutex_->Lock(); } - - ~GTestMutexLock() { mutex_->Unlock(); } - - private: - MutexBase* const mutex_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); -}; - -typedef GTestMutexLock MutexLock; - -// Helpers for ThreadLocal. - -// pthread_key_create() requires DeleteThreadLocalValue() to have -// C-linkage. Therefore it cannot be templatized to access -// ThreadLocal. Hence the need for class -// ThreadLocalValueHolderBase. -class ThreadLocalValueHolderBase { - public: - virtual ~ThreadLocalValueHolderBase() {} -}; - -// Called by pthread to delete thread-local data stored by -// pthread_setspecific(). 
-extern "C" inline void DeleteThreadLocalValue(void* value_holder) { - delete static_cast(value_holder); -} - -// Implements thread-local storage on pthreads-based systems. -// -// // Thread 1 -// ThreadLocal tl(100); // 100 is the default value for each thread. -// -// // Thread 2 -// tl.set(150); // Changes the value for thread 2 only. -// EXPECT_EQ(150, tl.get()); -// -// // Thread 1 -// EXPECT_EQ(100, tl.get()); // In thread 1, tl has the original value. -// tl.set(200); -// EXPECT_EQ(200, tl.get()); -// -// The template type argument T must have a public copy constructor. -// In addition, the default ThreadLocal constructor requires T to have -// a public default constructor. -// -// An object managed for a thread by a ThreadLocal instance is deleted -// when the thread exits. Or, if the ThreadLocal instance dies in -// that thread, when the ThreadLocal dies. It's the user's -// responsibility to ensure that all other threads using a ThreadLocal -// have exited when it dies, or the per-thread objects for those -// threads will not be deleted. -// -// Google Test only uses global ThreadLocal objects. That means they -// will die after main() has returned. Therefore, no per-thread -// object managed by Google Test will be leaked as long as all threads -// using Google Test have exited when main() returns. -template -class ThreadLocal { - public: - ThreadLocal() : key_(CreateKey()), - default_() {} - explicit ThreadLocal(const T& value) : key_(CreateKey()), - default_(value) {} - - ~ThreadLocal() { - // Destroys the managed object for the current thread, if any. - DeleteThreadLocalValue(pthread_getspecific(key_)); - - // Releases resources associated with the key. This will *not* - // delete managed objects for other threads. - GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_)); - } - - T* pointer() { return GetOrCreateValue(); } - const T* pointer() const { return GetOrCreateValue(); } - const T& get() const { return *pointer(); } - void set(const T& value) { *pointer() = value; } - - private: - // Holds a value of type T. - class ValueHolder : public ThreadLocalValueHolderBase { - public: - explicit ValueHolder(const T& value) : value_(value) {} - - T* pointer() { return &value_; } - - private: - T value_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); - }; - - static pthread_key_t CreateKey() { - pthread_key_t key; - // When a thread exits, DeleteThreadLocalValue() will be called on - // the object managed for that thread. - GTEST_CHECK_POSIX_SUCCESS_( - pthread_key_create(&key, &DeleteThreadLocalValue)); - return key; - } - - T* GetOrCreateValue() const { - ThreadLocalValueHolderBase* const holder = - static_cast(pthread_getspecific(key_)); - if (holder != NULL) { - return CheckedDowncastToActualType(holder)->pointer(); - } - - ValueHolder* const new_holder = new ValueHolder(default_); - ThreadLocalValueHolderBase* const holder_base = new_holder; - GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base)); - return new_holder->pointer(); - } - - // A key pthreads uses for looking up per-thread values. - const pthread_key_t key_; - const T default_; // The default value for each thread. - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); -}; - -# define GTEST_IS_THREADSAFE 1 - -#else // GTEST_HAS_PTHREAD - -// A dummy implementation of synchronization primitives (mutex, lock, -// and thread-local variable). Necessary for compiling Google Test where -// mutex is not supported - using Google Test in multiple threads is not -// supported on such platforms. 
- -class Mutex { - public: - Mutex() {} - void Lock() {} - void Unlock() {} - void AssertHeld() const {} -}; - -# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ - extern ::testing::internal::Mutex mutex - -# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex - -class GTestMutexLock { - public: - explicit GTestMutexLock(Mutex*) {} // NOLINT -}; - -typedef GTestMutexLock MutexLock; - -template -class ThreadLocal { - public: - ThreadLocal() : value_() {} - explicit ThreadLocal(const T& value) : value_(value) {} - T* pointer() { return &value_; } - const T* pointer() const { return &value_; } - const T& get() const { return value_; } - void set(const T& value) { value_ = value; } - private: - T value_; -}; - -// The above synchronization primitives have dummy implementations. -// Therefore Google Test is not thread-safe. -# define GTEST_IS_THREADSAFE 0 - -#endif // GTEST_HAS_PTHREAD - -// Returns the number of threads running in the process, or 0 to indicate that -// we cannot detect it. -GTEST_API_ size_t GetThreadCount(); - -// Passing non-POD classes through ellipsis (...) crashes the ARM -// compiler and generates a warning in Sun Studio. The Nokia Symbian -// and the IBM XL C/C++ compiler try to instantiate a copy constructor -// for objects passed through ellipsis (...), failing for uncopyable -// objects. We define this to ensure that only POD is passed through -// ellipsis on these systems. -#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || defined(__SUNPRO_CC) -// We lose support for NULL detection where the compiler doesn't like -// passing non-POD classes through ellipsis (...). -# define GTEST_ELLIPSIS_NEEDS_POD_ 1 -#else -# define GTEST_CAN_COMPARE_NULL 1 -#endif - -// The Nokia Symbian and IBM XL C/C++ compilers cannot decide between -// const T& and const T* in a function template. These compilers -// _can_ decide between class template specializations for T and T*, -// so a tr1::type_traits-like is_pointer works. -#if defined(__SYMBIAN32__) || defined(__IBMCPP__) -# define GTEST_NEEDS_IS_POINTER_ 1 -#endif - -template -struct bool_constant { - typedef bool_constant type; - static const bool value = bool_value; -}; -template const bool bool_constant::value; - -typedef bool_constant false_type; -typedef bool_constant true_type; - -template -struct is_pointer : public false_type {}; - -template -struct is_pointer : public true_type {}; - -template -struct IteratorTraits { - typedef typename Iterator::value_type value_type; -}; - -template -struct IteratorTraits { - typedef T value_type; -}; - -template -struct IteratorTraits { - typedef T value_type; -}; - -#if GTEST_OS_WINDOWS -# define GTEST_PATH_SEP_ "\\" -# define GTEST_HAS_ALT_PATH_SEP_ 1 -// The biggest signed integer type the compiler supports. -typedef __int64 BiggestInt; -#else -# define GTEST_PATH_SEP_ "/" -# define GTEST_HAS_ALT_PATH_SEP_ 0 -typedef long long BiggestInt; // NOLINT -#endif // GTEST_OS_WINDOWS - -// Utilities for char. - -// isspace(int ch) and friends accept an unsigned char or EOF. char -// may be signed, depending on the compiler (or compiler flags). -// Therefore we need to cast a char to unsigned char before calling -// isspace(), etc. 
- -inline bool IsAlpha(char ch) { - return isalpha(static_cast(ch)) != 0; -} -inline bool IsAlNum(char ch) { - return isalnum(static_cast(ch)) != 0; -} -inline bool IsDigit(char ch) { - return isdigit(static_cast(ch)) != 0; -} -inline bool IsLower(char ch) { - return islower(static_cast(ch)) != 0; -} -inline bool IsSpace(char ch) { - return isspace(static_cast(ch)) != 0; -} -inline bool IsUpper(char ch) { - return isupper(static_cast(ch)) != 0; -} -inline bool IsXDigit(char ch) { - return isxdigit(static_cast(ch)) != 0; -} -inline bool IsXDigit(wchar_t ch) { - const unsigned char low_byte = static_cast(ch); - return ch == low_byte && isxdigit(low_byte) != 0; -} - -inline char ToLower(char ch) { - return static_cast(tolower(static_cast(ch))); -} -inline char ToUpper(char ch) { - return static_cast(toupper(static_cast(ch))); -} - -// The testing::internal::posix namespace holds wrappers for common -// POSIX functions. These wrappers hide the differences between -// Windows/MSVC and POSIX systems. Since some compilers define these -// standard functions as macros, the wrapper cannot have the same name -// as the wrapped function. - -namespace posix { - -// Functions with a different name on Windows. - -#if GTEST_OS_WINDOWS - -typedef struct _stat StatStruct; - -# ifdef __BORLANDC__ -inline int IsATTY(int fd) { return isatty(fd); } -inline int StrCaseCmp(const char* s1, const char* s2) { - return stricmp(s1, s2); -} -inline char* StrDup(const char* src) { return strdup(src); } -# else // !__BORLANDC__ -# if GTEST_OS_WINDOWS_MOBILE -inline int IsATTY(int /* fd */) { return 0; } -# else -inline int IsATTY(int fd) { return _isatty(fd); } -# endif // GTEST_OS_WINDOWS_MOBILE -inline int StrCaseCmp(const char* s1, const char* s2) { - return _stricmp(s1, s2); -} -inline char* StrDup(const char* src) { return _strdup(src); } -# endif // __BORLANDC__ - -# if GTEST_OS_WINDOWS_MOBILE -inline int FileNo(FILE* file) { return reinterpret_cast(_fileno(file)); } -// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this -// time and thus not defined there. -# else -inline int FileNo(FILE* file) { return _fileno(file); } -inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); } -inline int RmDir(const char* dir) { return _rmdir(dir); } -inline bool IsDir(const StatStruct& st) { - return (_S_IFDIR & st.st_mode) != 0; -} -# endif // GTEST_OS_WINDOWS_MOBILE - -#else - -typedef struct stat StatStruct; - -inline int FileNo(FILE* file) { return fileno(file); } -inline int IsATTY(int fd) { return isatty(fd); } -inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); } -inline int StrCaseCmp(const char* s1, const char* s2) { - return strcasecmp(s1, s2); -} -inline char* StrDup(const char* src) { return strdup(src); } -inline int RmDir(const char* dir) { return rmdir(dir); } -inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } - -#endif // GTEST_OS_WINDOWS - -// Functions deprecated by MSVC 8.0. - -#ifdef _MSC_VER -// Temporarily disable warning 4996 (deprecated function). -# pragma warning(push) -# pragma warning(disable:4996) -#endif - -inline const char* StrNCpy(char* dest, const char* src, size_t n) { - return strncpy(dest, src, n); -} - -// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and -// StrError() aren't needed on Windows CE at this time and thus not -// defined there. 
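// Editor's sketch (illustrative, not part of the patch) of the wrapper pattern used
// in the posix namespace above: hide the MSVC/POSIX naming difference for a renamed
// or "deprecated" libc function behind one inline function, so callers never spell
// the platform-specific name.
#ifdef _WIN32
# include <string.h>    // _stricmp
#else
# include <strings.h>   // strcasecmp
#endif

namespace portable {
inline int StrCaseCmp(const char* lhs, const char* rhs) {
#ifdef _WIN32
  return _stricmp(lhs, rhs);
#else
  return strcasecmp(lhs, rhs);
#endif
}
}  // namespace portable
// portable::StrCaseCmp("VP9", "vp9") == 0 on both families of platforms.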
- -#if !GTEST_OS_WINDOWS_MOBILE -inline int ChDir(const char* dir) { return chdir(dir); } -#endif -inline FILE* FOpen(const char* path, const char* mode) { - return fopen(path, mode); -} -#if !GTEST_OS_WINDOWS_MOBILE -inline FILE *FReopen(const char* path, const char* mode, FILE* stream) { - return freopen(path, mode, stream); -} -inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); } -#endif -inline int FClose(FILE* fp) { return fclose(fp); } -#if !GTEST_OS_WINDOWS_MOBILE -inline int Read(int fd, void* buf, unsigned int count) { - return static_cast(read(fd, buf, count)); -} -inline int Write(int fd, const void* buf, unsigned int count) { - return static_cast(write(fd, buf, count)); -} -inline int Close(int fd) { return close(fd); } -inline const char* StrError(int errnum) { return strerror(errnum); } -#endif -inline const char* GetEnv(const char* name) { -#if GTEST_OS_WINDOWS_MOBILE - // We are on Windows CE, which has no environment variables. - return NULL; -#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9) - // Environment variables which we programmatically clear will be set to the - // empty string rather than unset (NULL). Handle that case. - const char* const env = getenv(name); - return (env != NULL && env[0] != '\0') ? env : NULL; -#else - return getenv(name); -#endif -} - -#ifdef _MSC_VER -# pragma warning(pop) // Restores the warning state. -#endif - -#if GTEST_OS_WINDOWS_MOBILE -// Windows CE has no C library. The abort() function is used in -// several places in Google Test. This implementation provides a reasonable -// imitation of standard behaviour. -void Abort(); -#else -inline void Abort() { abort(); } -#endif // GTEST_OS_WINDOWS_MOBILE - -} // namespace posix - -// MSVC "deprecates" snprintf and issues warnings wherever it is used. In -// order to avoid these warnings, we need to use _snprintf or _snprintf_s on -// MSVC-based platforms. We map the GTEST_SNPRINTF_ macro to the appropriate -// function in order to achieve that. We use macro definition here because -// snprintf is a variadic function. -#if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE -// MSVC 2005 and above support variadic macros. -# define GTEST_SNPRINTF_(buffer, size, format, ...) \ - _snprintf_s(buffer, size, size, format, __VA_ARGS__) -#elif defined(_MSC_VER) -// Windows CE does not define _snprintf_s and MSVC prior to 2005 doesn't -// complain about _snprintf. -# define GTEST_SNPRINTF_ _snprintf -#else -# define GTEST_SNPRINTF_ snprintf -#endif - -// The maximum number a BiggestInt can represent. This definition -// works no matter BiggestInt is represented in one's complement or -// two's complement. -// -// We cannot rely on numeric_limits in STL, as __int64 and long long -// are not part of standard C++ and numeric_limits doesn't need to be -// defined for them. -const BiggestInt kMaxBiggestInt = - ~(static_cast(1) << (8*sizeof(BiggestInt) - 1)); - -// This template class serves as a compile-time function from size to -// type. It maps a size in bytes to a primitive type with that -// size. e.g. -// -// TypeWithSize<4>::UInt -// -// is typedef-ed to be unsigned int (unsigned integer made up of 4 -// bytes). -// -// Such functionality should belong to STL, but I cannot find it -// there. -// -// Google Test uses this class in the implementation of floating-point -// comparison. -// -// For now it only handles UInt (unsigned int) as that's all Google Test -// needs. Other types can be easily added in the future if need -// arises. 
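// Editor's note: the kMaxBiggestInt initializer above lost its template argument to
// angle-bracket stripping. Reconstructed from the surrounding comment, the expression
// has the shape below: set only the sign bit, then complement it, which yields the
// largest value the signed type can hold.
typedef long long BiggestIntSketch;   // stand-in for BiggestInt on non-Windows builds
const BiggestIntSketch kMaxBiggestIntSketch =
    ~(static_cast<BiggestIntSketch>(1) << (8 * sizeof(BiggestIntSketch) - 1));
// For a 64-bit type this evaluates to 0x7FFFFFFFFFFFFFFF.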
-template -class TypeWithSize { - public: - // This prevents the user from using TypeWithSize with incorrect - // values of N. - typedef void UInt; -}; - -// The specialization for size 4. -template <> -class TypeWithSize<4> { - public: - // unsigned int has size 4 in both gcc and MSVC. - // - // As base/basictypes.h doesn't compile on Windows, we cannot use - // uint32, uint64, and etc here. - typedef int Int; - typedef unsigned int UInt; -}; - -// The specialization for size 8. -template <> -class TypeWithSize<8> { - public: -#if GTEST_OS_WINDOWS - typedef __int64 Int; - typedef unsigned __int64 UInt; -#else - typedef long long Int; // NOLINT - typedef unsigned long long UInt; // NOLINT -#endif // GTEST_OS_WINDOWS -}; - -// Integer types of known sizes. -typedef TypeWithSize<4>::Int Int32; -typedef TypeWithSize<4>::UInt UInt32; -typedef TypeWithSize<8>::Int Int64; -typedef TypeWithSize<8>::UInt UInt64; -typedef TypeWithSize<8>::Int TimeInMillis; // Represents time in milliseconds. - -// Utilities for command line flags and environment variables. - -// Macro for referencing flags. -#define GTEST_FLAG(name) FLAGS_gtest_##name - -// Macros for declaring flags. -#define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name) -#define GTEST_DECLARE_int32_(name) \ - GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name) -#define GTEST_DECLARE_string_(name) \ - GTEST_API_ extern ::std::string GTEST_FLAG(name) - -// Macros for defining flags. -#define GTEST_DEFINE_bool_(name, default_val, doc) \ - GTEST_API_ bool GTEST_FLAG(name) = (default_val) -#define GTEST_DEFINE_int32_(name, default_val, doc) \ - GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val) -#define GTEST_DEFINE_string_(name, default_val, doc) \ - GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val) - -// Thread annotations -#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) -#define GTEST_LOCK_EXCLUDED_(locks) - -// Parses 'str' for a 32-bit signed integer. If successful, writes the result -// to *value and returns true; otherwise leaves *value unchanged and returns -// false. -// TODO(chandlerc): Find a better way to refactor flag and environment parsing -// out of both gtest-port.cc and gtest.cc to avoid exporting this utility -// function. -bool ParseInt32(const Message& src_text, const char* str, Int32* value); - -// Parses a bool/Int32/string from the environment variable -// corresponding to the given Google Test flag. -bool BoolFromGTestEnv(const char* flag, bool default_val); -GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val); -const char* StringFromGTestEnv(const char* flag, const char* default_val); - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ - -#if GTEST_OS_LINUX -# include -# include -# include -# include -#endif // GTEST_OS_LINUX - -#if GTEST_HAS_EXCEPTIONS -# include -#endif - -#include -#include -#include -#include -#include -#include - -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. 
-// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file defines the Message class. -// -// IMPORTANT NOTE: Due to limitation of the C++ language, we have to -// leave some internal implementation details in this header file. -// They are clearly marked by comments like this: -// -// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -// -// Such code is NOT meant to be used by a user directly, and is subject -// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user -// program! - -#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ -#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ - -#include - - -// Ensures that there is at least one operator<< in the global namespace. -// See Message& operator<<(...) below for why. -void operator<<(const testing::internal::Secret&, int); - -namespace testing { - -// The Message class works like an ostream repeater. -// -// Typical usage: -// -// 1. You stream a bunch of values to a Message object. -// It will remember the text in a stringstream. -// 2. Then you stream the Message object to an ostream. -// This causes the text in the Message to be streamed -// to the ostream. -// -// For example; -// -// testing::Message foo; -// foo << 1 << " != " << 2; -// std::cout << foo; -// -// will print "1 != 2". -// -// Message is not intended to be inherited from. In particular, its -// destructor is not virtual. -// -// Note that stringstream behaves differently in gcc and in MSVC. You -// can stream a NULL char pointer to it in the former, but not in the -// latter (it causes an access violation if you do). The Message -// class hides this difference by treating a NULL char pointer as -// "(null)". -class GTEST_API_ Message { - private: - // The type of basic IO manipulators (endl, ends, and flush) for - // narrow streams. - typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&); - - public: - // Constructs an empty Message. - Message(); - - // Copy constructor. - Message(const Message& msg) : ss_(new ::std::stringstream) { // NOLINT - *ss_ << msg.GetString(); - } - - // Constructs a Message from a C-string. - explicit Message(const char* str) : ss_(new ::std::stringstream) { - *ss_ << str; - } - -#if GTEST_OS_SYMBIAN - // Streams a value (either a pointer or not) to this object. 
- template - inline Message& operator <<(const T& value) { - StreamHelper(typename internal::is_pointer::type(), value); - return *this; - } -#else - // Streams a non-pointer value to this object. - template - inline Message& operator <<(const T& val) { - // Some libraries overload << for STL containers. These - // overloads are defined in the global namespace instead of ::std. - // - // C++'s symbol lookup rule (i.e. Koenig lookup) says that these - // overloads are visible in either the std namespace or the global - // namespace, but not other namespaces, including the testing - // namespace which Google Test's Message class is in. - // - // To allow STL containers (and other types that has a << operator - // defined in the global namespace) to be used in Google Test - // assertions, testing::Message must access the custom << operator - // from the global namespace. With this using declaration, - // overloads of << defined in the global namespace and those - // visible via Koenig lookup are both exposed in this function. - using ::operator <<; - *ss_ << val; - return *this; - } - - // Streams a pointer value to this object. - // - // This function is an overload of the previous one. When you - // stream a pointer to a Message, this definition will be used as it - // is more specialized. (The C++ Standard, section - // [temp.func.order].) If you stream a non-pointer, then the - // previous definition will be used. - // - // The reason for this overload is that streaming a NULL pointer to - // ostream is undefined behavior. Depending on the compiler, you - // may get "0", "(nil)", "(null)", or an access violation. To - // ensure consistent result across compilers, we always treat NULL - // as "(null)". - template - inline Message& operator <<(T* const& pointer) { // NOLINT - if (pointer == NULL) { - *ss_ << "(null)"; - } else { - *ss_ << pointer; - } - return *this; - } -#endif // GTEST_OS_SYMBIAN - - // Since the basic IO manipulators are overloaded for both narrow - // and wide streams, we have to provide this specialized definition - // of operator <<, even though its body is the same as the - // templatized version above. Without this definition, streaming - // endl or other basic IO manipulators to Message will confuse the - // compiler. - Message& operator <<(BasicNarrowIoManip val) { - *ss_ << val; - return *this; - } - - // Instead of 1/0, we want to see true/false for bool values. - Message& operator <<(bool b) { - return *this << (b ? "true" : "false"); - } - - // These two overloads allow streaming a wide C string to a Message - // using the UTF-8 encoding. - Message& operator <<(const wchar_t* wide_c_str); - Message& operator <<(wchar_t* wide_c_str); - -#if GTEST_HAS_STD_WSTRING - // Converts the given wide string to a narrow string using the UTF-8 - // encoding, and streams the result to this Message object. - Message& operator <<(const ::std::wstring& wstr); -#endif // GTEST_HAS_STD_WSTRING - -#if GTEST_HAS_GLOBAL_WSTRING - // Converts the given wide string to a narrow string using the UTF-8 - // encoding, and streams the result to this Message object. - Message& operator <<(const ::wstring& wstr); -#endif // GTEST_HAS_GLOBAL_WSTRING - - // Gets the text streamed to this object so far as an std::string. - // Each '\0' character in the buffer is replaced with "\\0". - // - // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. 
- std::string GetString() const; - - private: - -#if GTEST_OS_SYMBIAN - // These are needed as the Nokia Symbian Compiler cannot decide between - // const T& and const T* in a function template. The Nokia compiler _can_ - // decide between class template specializations for T and T*, so a - // tr1::type_traits-like is_pointer works, and we can overload on that. - template - inline void StreamHelper(internal::true_type /*is_pointer*/, T* pointer) { - if (pointer == NULL) { - *ss_ << "(null)"; - } else { - *ss_ << pointer; - } - } - template - inline void StreamHelper(internal::false_type /*is_pointer*/, - const T& value) { - // See the comments in Message& operator <<(const T&) above for why - // we need this using statement. - using ::operator <<; - *ss_ << value; - } -#endif // GTEST_OS_SYMBIAN - - // We'll hold the text streamed to this object here. - const internal::scoped_ptr< ::std::stringstream> ss_; - - // We declare (but don't implement) this to prevent the compiler - // from implementing the assignment operator. - void operator=(const Message&); -}; - -// Streams a Message to an ostream. -inline std::ostream& operator <<(std::ostream& os, const Message& sb) { - return os << sb.GetString(); -} - -namespace internal { - -// Converts a streamable value to an std::string. A NULL pointer is -// converted to "(null)". When the input value is a ::string, -// ::std::string, ::wstring, or ::std::wstring object, each NUL -// character in it is replaced with "\\0". -template -std::string StreamableToString(const T& streamable) { - return (Message() << streamable).GetString(); -} - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file declares the String class and functions used internally by -// Google Test. They are subject to change without notice. 
They should not used -// by code external to Google Test. -// -// This header file is #included by . -// It should not be #included by other files. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ - -#ifdef __BORLANDC__ -// string.h is not guaranteed to provide strcpy on C++ Builder. -# include -#endif - -#include -#include - - -namespace testing { -namespace internal { - -// String - an abstract class holding static string utilities. -class GTEST_API_ String { - public: - // Static utility methods - - // Clones a 0-terminated C string, allocating memory using new. The - // caller is responsible for deleting the return value using - // delete[]. Returns the cloned string, or NULL if the input is - // NULL. - // - // This is different from strdup() in string.h, which allocates - // memory using malloc(). - static const char* CloneCString(const char* c_str); - -#if GTEST_OS_WINDOWS_MOBILE - // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be - // able to pass strings to Win32 APIs on CE we need to convert them - // to 'Unicode', UTF-16. - - // Creates a UTF-16 wide string from the given ANSI string, allocating - // memory using new. The caller is responsible for deleting the return - // value using delete[]. Returns the wide string, or NULL if the - // input is NULL. - // - // The wide string is created using the ANSI codepage (CP_ACP) to - // match the behaviour of the ANSI versions of Win32 calls and the - // C runtime. - static LPCWSTR AnsiToUtf16(const char* c_str); - - // Creates an ANSI string from the given wide string, allocating - // memory using new. The caller is responsible for deleting the return - // value using delete[]. Returns the ANSI string, or NULL if the - // input is NULL. - // - // The returned string is created using the ANSI codepage (CP_ACP) to - // match the behaviour of the ANSI versions of Win32 calls and the - // C runtime. - static const char* Utf16ToAnsi(LPCWSTR utf16_str); -#endif - - // Compares two C strings. Returns true iff they have the same content. - // - // Unlike strcmp(), this function can handle NULL argument(s). A - // NULL C string is considered different to any non-NULL C string, - // including the empty string. - static bool CStringEquals(const char* lhs, const char* rhs); - - // Converts a wide C string to a String using the UTF-8 encoding. - // NULL will be converted to "(null)". If an error occurred during - // the conversion, "(failed to convert from wide string)" is - // returned. - static std::string ShowWideCString(const wchar_t* wide_c_str); - - // Compares two wide C strings. Returns true iff they have the same - // content. - // - // Unlike wcscmp(), this function can handle NULL argument(s). A - // NULL C string is considered different to any non-NULL C string, - // including the empty string. - static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs); - - // Compares two C strings, ignoring case. Returns true iff they - // have the same content. - // - // Unlike strcasecmp(), this function can handle NULL argument(s). - // A NULL C string is considered different to any non-NULL C string, - // including the empty string. - static bool CaseInsensitiveCStringEquals(const char* lhs, - const char* rhs); - - // Compares two wide C strings, ignoring case. Returns true iff they - // have the same content. - // - // Unlike wcscasecmp(), this function can handle NULL argument(s). 
- // A NULL C string is considered different to any non-NULL wide C string, - // including the empty string. - // NB: The implementations on different platforms slightly differ. - // On windows, this method uses _wcsicmp which compares according to LC_CTYPE - // environment variable. On GNU platform this method uses wcscasecmp - // which compares according to LC_CTYPE category of the current locale. - // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the - // current locale. - static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs, - const wchar_t* rhs); - - // Returns true iff the given string ends with the given suffix, ignoring - // case. Any string is considered to end with an empty suffix. - static bool EndsWithCaseInsensitive( - const std::string& str, const std::string& suffix); - - // Formats an int value as "%02d". - static std::string FormatIntWidth2(int value); // "%02d" for width == 2 - - // Formats an int value as "%X". - static std::string FormatHexInt(int value); - - // Formats a byte as "%02X". - static std::string FormatByte(unsigned char value); - - private: - String(); // Not meant to be instantiated. -}; // class String - -// Gets the content of the stringstream's buffer as an std::string. Each '\0' -// character in the buffer is replaced with "\\0". -GTEST_API_ std::string StringStreamToString(::std::stringstream* stream); - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: keith.ray@gmail.com (Keith Ray) -// -// Google Test filepath utilities -// -// This header file declares classes and functions used internally by -// Google Test. They are subject to change without notice. -// -// This file is #included in . -// Do not include this header file separately! 
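// Editor's sketch (illustrative, not gtest's implementation) of the behaviour
// documented for StringStreamToString() above: the stringstream buffer is returned
// as a std::string with every embedded NUL byte rendered as "\\0", so binary data
// stays printable in failure messages.
#include <sstream>
#include <string>

std::string ToPrintableString(const std::stringstream& ss) {
  const std::string raw = ss.str();
  std::string out;
  for (std::string::size_type i = 0; i < raw.size(); ++i) {
    if (raw[i] == '\0')
      out += "\\0";
    else
      out += raw[i];
  }
  return out;
}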
- -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ - - -namespace testing { -namespace internal { - -// FilePath - a class for file and directory pathname manipulation which -// handles platform-specific conventions (like the pathname separator). -// Used for helper functions for naming files in a directory for xml output. -// Except for Set methods, all methods are const or static, which provides an -// "immutable value object" -- useful for peace of mind. -// A FilePath with a value ending in a path separator ("like/this/") represents -// a directory, otherwise it is assumed to represent a file. In either case, -// it may or may not represent an actual file or directory in the file system. -// Names are NOT checked for syntax correctness -- no checking for illegal -// characters, malformed paths, etc. - -class GTEST_API_ FilePath { - public: - FilePath() : pathname_("") { } - FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { } - - explicit FilePath(const std::string& pathname) : pathname_(pathname) { - Normalize(); - } - - FilePath& operator=(const FilePath& rhs) { - Set(rhs); - return *this; - } - - void Set(const FilePath& rhs) { - pathname_ = rhs.pathname_; - } - - const std::string& string() const { return pathname_; } - const char* c_str() const { return pathname_.c_str(); } - - // Returns the current working directory, or "" if unsuccessful. - static FilePath GetCurrentDir(); - - // Given directory = "dir", base_name = "test", number = 0, - // extension = "xml", returns "dir/test.xml". If number is greater - // than zero (e.g., 12), returns "dir/test_12.xml". - // On Windows platform, uses \ as the separator rather than /. - static FilePath MakeFileName(const FilePath& directory, - const FilePath& base_name, - int number, - const char* extension); - - // Given directory = "dir", relative_path = "test.xml", - // returns "dir/test.xml". - // On Windows, uses \ as the separator rather than /. - static FilePath ConcatPaths(const FilePath& directory, - const FilePath& relative_path); - - // Returns a pathname for a file that does not currently exist. The pathname - // will be directory/base_name.extension or - // directory/base_name_.extension if directory/base_name.extension - // already exists. The number will be incremented until a pathname is found - // that does not already exist. - // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. - // There could be a race condition if two or more processes are calling this - // function at the same time -- they could both pick the same filename. - static FilePath GenerateUniqueFileName(const FilePath& directory, - const FilePath& base_name, - const char* extension); - - // Returns true iff the path is "". - bool IsEmpty() const { return pathname_.empty(); } - - // If input name has a trailing separator character, removes it and returns - // the name, otherwise return the name string unmodified. - // On Windows platform, uses \ as the separator, other platforms use /. - FilePath RemoveTrailingPathSeparator() const; - - // Returns a copy of the FilePath with the directory part removed. - // Example: FilePath("path/to/file").RemoveDirectoryName() returns - // FilePath("file"). If there is no directory part ("just_a_file"), it returns - // the FilePath unmodified. If there is no file part ("just_a_dir/") it - // returns an empty FilePath (""). - // On Windows platform, '\' is the path separator, otherwise it is '/'. 
- FilePath RemoveDirectoryName() const; - - // RemoveFileName returns the directory path with the filename removed. - // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". - // If the FilePath is "a_file" or "/a_file", RemoveFileName returns - // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does - // not have a file, like "just/a/dir/", it returns the FilePath unmodified. - // On Windows platform, '\' is the path separator, otherwise it is '/'. - FilePath RemoveFileName() const; - - // Returns a copy of the FilePath with the case-insensitive extension removed. - // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns - // FilePath("dir/file"). If a case-insensitive extension is not - // found, returns a copy of the original FilePath. - FilePath RemoveExtension(const char* extension) const; - - // Creates directories so that path exists. Returns true if successful or if - // the directories already exist; returns false if unable to create - // directories for any reason. Will also return false if the FilePath does - // not represent a directory (that is, it doesn't end with a path separator). - bool CreateDirectoriesRecursively() const; - - // Create the directory so that path exists. Returns true if successful or - // if the directory already exists; returns false if unable to create the - // directory for any reason, including if the parent directory does not - // exist. Not named "CreateDirectory" because that's a macro on Windows. - bool CreateFolder() const; - - // Returns true if FilePath describes something in the file-system, - // either a file, directory, or whatever, and that something exists. - bool FileOrDirectoryExists() const; - - // Returns true if pathname describes a directory in the file-system - // that exists. - bool DirectoryExists() const; - - // Returns true if FilePath ends with a path separator, which indicates that - // it is intended to represent a directory. Returns false otherwise. - // This does NOT check that a directory (or file) actually exists. - bool IsDirectory() const; - - // Returns true if pathname describes a root directory. (Windows has one - // root directory per disk drive.) - bool IsRootDirectory() const; - - // Returns true if pathname describes an absolute path. - bool IsAbsolutePath() const; - - private: - // Replaces multiple consecutive separators with a single separator. - // For example, "bar///foo" becomes "bar/foo". Does not eliminate other - // redundancies that might be in a pathname involving "." or "..". - // - // A pathname with multiple consecutive separators may occur either through - // user error or as a result of some scripts or APIs that generate a pathname - // with a trailing separator. On other platforms the same API or script - // may NOT generate a pathname with a trailing "/". Then elsewhere that - // pathname may have another "/" and pathname components added to it, - // without checking for the separator already being there. - // The script language and operating system may allow paths like "foo//bar" - // but some of the functions in FilePath will not handle that correctly. In - // particular, RemoveTrailingPathSeparator() only removes one separator, and - // it is called in CreateDirectoriesRecursively() assuming that it will change - // a pathname from directory syntax (trailing separator) to filename syntax. 
- // - // On Windows this method also replaces the alternate path separator '/' with - // the primary path separator '\\', so that for example "bar\\/\\foo" becomes - // "bar\\foo". - - void Normalize(); - - // Returns a pointer to the last occurence of a valid path separator in - // the FilePath. On Windows, for example, both '/' and '\' are valid path - // separators. Returns NULL if no path separator was found. - const char* FindLastPathSeparator() const; - - std::string pathname_; -}; // class FilePath - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ -// This file was GENERATED by command: -// pump.py gtest-type-util.h.pump -// DO NOT EDIT BY HAND!!! - -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -// Type utilities needed for implementing typed and type-parameterized -// tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// -// Currently we support at most 50 types in a list, and at most 50 -// type-parameterized tests in one type-parameterized test case. -// Please contact googletestframework@googlegroups.com if you need -// more. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ - - -// #ifdef __GNUC__ is too general here. It is possible to use gcc without using -// libstdc++ (which is where cxxabi.h comes from). -# if GTEST_HAS_CXXABI_H_ -# include -# elif defined(__HP_aCC) -# include -# endif // GTEST_HASH_CXXABI_H_ - -namespace testing { -namespace internal { - -// GetTypeName() returns a human-readable name of type T. -// NB: This function is also used in Google Mock, so don't move it inside of -// the typed-test-only section below. 
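// Editor's sketch of the RTTI + demangling technique that GetTypeName(), defined
// just below, is documented as using. Assumes a GCC/Clang toolchain where
// <cxxabi.h> is available; the real function falls back to the raw typeid name,
// or to an empty string when RTTI is disabled.
#include <cxxabi.h>
#include <cstdlib>
#include <string>
#include <typeinfo>

template <typename T>
std::string DemangledNameSketch() {
  int status = 0;
  char* readable = abi::__cxa_demangle(typeid(T).name(), 0, 0, &status);
  const std::string result(status == 0 && readable != NULL ? readable
                                                            : typeid(T).name());
  std::free(readable);   // free(NULL) is a no-op
  return result;         // e.g. "i" becomes "int"
}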
-template -std::string GetTypeName() { -# if GTEST_HAS_RTTI - - const char* const name = typeid(T).name(); -# if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC) - int status = 0; - // gcc's implementation of typeid(T).name() mangles the type name, - // so we have to demangle it. -# if GTEST_HAS_CXXABI_H_ - using abi::__cxa_demangle; -# endif // GTEST_HAS_CXXABI_H_ - char* const readable_name = __cxa_demangle(name, 0, 0, &status); - const std::string name_str(status == 0 ? readable_name : name); - free(readable_name); - return name_str; -# else - return name; -# endif // GTEST_HAS_CXXABI_H_ || __HP_aCC - -# else - - return ""; - -# endif // GTEST_HAS_RTTI -} - -#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - -// AssertyTypeEq::type is defined iff T1 and T2 are the same -// type. This can be used as a compile-time assertion to ensure that -// two types are equal. - -template -struct AssertTypeEq; - -template -struct AssertTypeEq { - typedef bool type; -}; - -// A unique type used as the default value for the arguments of class -// template Types. This allows us to simulate variadic templates -// (e.g. Types, Type, and etc), which C++ doesn't -// support directly. -struct None {}; - -// The following family of struct and struct templates are used to -// represent type lists. In particular, TypesN -// represents a type list with N types (T1, T2, ..., and TN) in it. -// Except for Types0, every struct in the family has two member types: -// Head for the first type in the list, and Tail for the rest of the -// list. - -// The empty type list. -struct Types0 {}; - -// Type lists of length 1, 2, 3, and so on. - -template -struct Types1 { - typedef T1 Head; - typedef Types0 Tail; -}; -template -struct Types2 { - typedef T1 Head; - typedef Types1 Tail; -}; - -template -struct Types3 { - typedef T1 Head; - typedef Types2 Tail; -}; - -template -struct Types4 { - typedef T1 Head; - typedef Types3 Tail; -}; - -template -struct Types5 { - typedef T1 Head; - typedef Types4 Tail; -}; - -template -struct Types6 { - typedef T1 Head; - typedef Types5 Tail; -}; - -template -struct Types7 { - typedef T1 Head; - typedef Types6 Tail; -}; - -template -struct Types8 { - typedef T1 Head; - typedef Types7 Tail; -}; - -template -struct Types9 { - typedef T1 Head; - typedef Types8 Tail; -}; - -template -struct Types10 { - typedef T1 Head; - typedef Types9 Tail; -}; - -template -struct Types11 { - typedef T1 Head; - typedef Types10 Tail; -}; - -template -struct Types12 { - typedef T1 Head; - typedef Types11 Tail; -}; - -template -struct Types13 { - typedef T1 Head; - typedef Types12 Tail; -}; - -template -struct Types14 { - typedef T1 Head; - typedef Types13 Tail; -}; - -template -struct Types15 { - typedef T1 Head; - typedef Types14 Tail; -}; - -template -struct Types16 { - typedef T1 Head; - typedef Types15 Tail; -}; - -template -struct Types17 { - typedef T1 Head; - typedef Types16 Tail; -}; - -template -struct Types18 { - typedef T1 Head; - typedef Types17 Tail; -}; - -template -struct Types19 { - typedef T1 Head; - typedef Types18 Tail; -}; - -template -struct Types20 { - typedef T1 Head; - typedef Types19 Tail; -}; - -template -struct Types21 { - typedef T1 Head; - typedef Types20 Tail; -}; - -template -struct Types22 { - typedef T1 Head; - typedef Types21 Tail; -}; - -template -struct Types23 { - typedef T1 Head; - typedef Types22 Tail; -}; - -template -struct Types24 { - typedef T1 Head; - typedef Types23 Tail; -}; - -template -struct Types25 { - typedef T1 Head; - typedef Types24 Tail; -}; - -template -struct 
Types26 { - typedef T1 Head; - typedef Types25 Tail; -}; - -template -struct Types27 { - typedef T1 Head; - typedef Types26 Tail; -}; - -template -struct Types28 { - typedef T1 Head; - typedef Types27 Tail; -}; - -template -struct Types29 { - typedef T1 Head; - typedef Types28 Tail; -}; - -template -struct Types30 { - typedef T1 Head; - typedef Types29 Tail; -}; - -template -struct Types31 { - typedef T1 Head; - typedef Types30 Tail; -}; - -template -struct Types32 { - typedef T1 Head; - typedef Types31 Tail; -}; - -template -struct Types33 { - typedef T1 Head; - typedef Types32 Tail; -}; - -template -struct Types34 { - typedef T1 Head; - typedef Types33 Tail; -}; - -template -struct Types35 { - typedef T1 Head; - typedef Types34 Tail; -}; - -template -struct Types36 { - typedef T1 Head; - typedef Types35 Tail; -}; - -template -struct Types37 { - typedef T1 Head; - typedef Types36 Tail; -}; - -template -struct Types38 { - typedef T1 Head; - typedef Types37 Tail; -}; - -template -struct Types39 { - typedef T1 Head; - typedef Types38 Tail; -}; - -template -struct Types40 { - typedef T1 Head; - typedef Types39 Tail; -}; - -template -struct Types41 { - typedef T1 Head; - typedef Types40 Tail; -}; - -template -struct Types42 { - typedef T1 Head; - typedef Types41 Tail; -}; - -template -struct Types43 { - typedef T1 Head; - typedef Types42 Tail; -}; - -template -struct Types44 { - typedef T1 Head; - typedef Types43 Tail; -}; - -template -struct Types45 { - typedef T1 Head; - typedef Types44 Tail; -}; - -template -struct Types46 { - typedef T1 Head; - typedef Types45 Tail; -}; - -template -struct Types47 { - typedef T1 Head; - typedef Types46 Tail; -}; - -template -struct Types48 { - typedef T1 Head; - typedef Types47 Tail; -}; - -template -struct Types49 { - typedef T1 Head; - typedef Types48 Tail; -}; - -template -struct Types50 { - typedef T1 Head; - typedef Types49 Tail; -}; - - -} // namespace internal - -// We don't want to require the users to write TypesN<...> directly, -// as that would require them to count the length. Types<...> is much -// easier to write, but generates horrible messages when there is a -// compiler error, as gcc insists on printing out each template -// argument, even if it has the default value (this means Types -// will appear as Types in the compiler -// errors). -// -// Our solution is to combine the best part of the two approaches: a -// user would write Types, and Google Test will translate -// that to TypesN internally to make error messages -// readable. The translation is done by the 'type' member of the -// Types template. 
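The Head/Tail encoding described above can be hard to see through fifty generated specializations; a minimal hand-written sketch of the same idea (names invented, lists of length at most two) looks like this:

// The empty list terminates the compile-time recursion.
struct MyTypes0 {};

template <typename T1>
struct MyTypes1 {
  typedef T1 Head;
  typedef MyTypes0 Tail;
};

template <typename T1, typename T2>
struct MyTypes2 {
  typedef T1 Head;
  typedef MyTypes1<T2> Tail;
};

// Compile-time traversal: peel off Head, recurse into Tail.
typedef MyTypes2<int, char> List;
typedef List::Head First;         // int
typedef List::Tail::Head Second;  // char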
-template -struct Types { - typedef internal::Types50 type; -}; - -template <> -struct Types { - typedef internal::Types0 type; -}; -template -struct Types { - typedef internal::Types1 type; -}; -template -struct Types { - typedef internal::Types2 type; -}; -template -struct Types { - typedef internal::Types3 type; -}; -template -struct Types { - typedef internal::Types4 type; -}; -template -struct Types { - typedef internal::Types5 type; -}; -template -struct Types { - typedef internal::Types6 type; -}; -template -struct Types { - typedef internal::Types7 type; -}; -template -struct Types { - typedef internal::Types8 type; -}; -template -struct Types { - typedef internal::Types9 type; -}; -template -struct Types { - typedef internal::Types10 type; -}; -template -struct Types { - typedef internal::Types11 type; -}; -template -struct Types { - typedef internal::Types12 type; -}; -template -struct Types { - typedef internal::Types13 type; -}; -template -struct Types { - typedef internal::Types14 type; -}; -template -struct Types { - typedef internal::Types15 type; -}; -template -struct Types { - typedef internal::Types16 type; -}; -template -struct Types { - typedef internal::Types17 type; -}; -template -struct Types { - typedef internal::Types18 type; -}; -template -struct Types { - typedef internal::Types19 type; -}; -template -struct Types { - typedef internal::Types20 type; -}; -template -struct Types { - typedef internal::Types21 type; -}; -template -struct Types { - typedef internal::Types22 type; -}; -template -struct Types { - typedef internal::Types23 type; -}; -template -struct Types { - typedef internal::Types24 type; -}; -template -struct Types { - typedef internal::Types25 type; -}; -template -struct Types { - typedef internal::Types26 type; -}; -template -struct Types { - typedef internal::Types27 type; -}; -template -struct Types { - typedef internal::Types28 type; -}; -template -struct Types { - typedef internal::Types29 type; -}; -template -struct Types { - typedef internal::Types30 type; -}; -template -struct Types { - typedef internal::Types31 type; -}; -template -struct Types { - typedef internal::Types32 type; -}; -template -struct Types { - typedef internal::Types33 type; -}; -template -struct Types { - typedef internal::Types34 type; -}; -template -struct Types { - typedef internal::Types35 type; -}; -template -struct Types { - typedef internal::Types36 type; -}; -template -struct Types { - typedef internal::Types37 type; -}; -template -struct Types { - typedef internal::Types38 type; -}; -template -struct Types { - typedef internal::Types39 type; -}; -template -struct Types { - typedef internal::Types40 type; -}; -template -struct Types { - typedef internal::Types41 type; -}; -template -struct Types { - typedef internal::Types42 type; -}; -template -struct Types { - typedef internal::Types43 type; -}; -template -struct Types { - typedef internal::Types44 type; -}; -template -struct Types { - typedef internal::Types45 type; -}; -template -struct Types { - typedef internal::Types46 type; -}; -template -struct Types { - typedef internal::Types47 type; -}; -template -struct Types { - typedef internal::Types48 type; -}; -template -struct Types { - typedef internal::Types49 type; -}; - -namespace internal { - -# define GTEST_TEMPLATE_ template class - -// The template "selector" struct TemplateSel is used to -// represent Tmpl, which must be a class template with one type -// parameter, as a type. TemplateSel::Bind::type is defined -// as the type Tmpl. 
This allows us to actually instantiate the -// template "selected" by TemplateSel. -// -// This trick is necessary for simulating typedef for class templates, -// which C++ doesn't support directly. -template -struct TemplateSel { - template - struct Bind { - typedef Tmpl type; - }; -}; - -# define GTEST_BIND_(TmplSel, T) \ - TmplSel::template Bind::type - -// A unique struct template used as the default value for the -// arguments of class template Templates. This allows us to simulate -// variadic templates (e.g. Templates, Templates, -// and etc), which C++ doesn't support directly. -template -struct NoneT {}; - -// The following family of struct and struct templates are used to -// represent template lists. In particular, TemplatesN represents a list of N templates (T1, T2, ..., and TN). Except -// for Templates0, every struct in the family has two member types: -// Head for the selector of the first template in the list, and Tail -// for the rest of the list. - -// The empty template list. -struct Templates0 {}; - -// Template lists of length 1, 2, 3, and so on. - -template -struct Templates1 { - typedef TemplateSel Head; - typedef Templates0 Tail; -}; -template -struct Templates2 { - typedef TemplateSel Head; - typedef Templates1 Tail; -}; - -template -struct Templates3 { - typedef TemplateSel Head; - typedef Templates2 Tail; -}; - -template -struct Templates4 { - typedef TemplateSel Head; - typedef Templates3 Tail; -}; - -template -struct Templates5 { - typedef TemplateSel Head; - typedef Templates4 Tail; -}; - -template -struct Templates6 { - typedef TemplateSel Head; - typedef Templates5 Tail; -}; - -template -struct Templates7 { - typedef TemplateSel Head; - typedef Templates6 Tail; -}; - -template -struct Templates8 { - typedef TemplateSel Head; - typedef Templates7 Tail; -}; - -template -struct Templates9 { - typedef TemplateSel Head; - typedef Templates8 Tail; -}; - -template -struct Templates10 { - typedef TemplateSel Head; - typedef Templates9 Tail; -}; - -template -struct Templates11 { - typedef TemplateSel Head; - typedef Templates10 Tail; -}; - -template -struct Templates12 { - typedef TemplateSel Head; - typedef Templates11 Tail; -}; - -template -struct Templates13 { - typedef TemplateSel Head; - typedef Templates12 Tail; -}; - -template -struct Templates14 { - typedef TemplateSel Head; - typedef Templates13 Tail; -}; - -template -struct Templates15 { - typedef TemplateSel Head; - typedef Templates14 Tail; -}; - -template -struct Templates16 { - typedef TemplateSel Head; - typedef Templates15 Tail; -}; - -template -struct Templates17 { - typedef TemplateSel Head; - typedef Templates16 Tail; -}; - -template -struct Templates18 { - typedef TemplateSel Head; - typedef Templates17 Tail; -}; - -template -struct Templates19 { - typedef TemplateSel Head; - typedef Templates18 Tail; -}; - -template -struct Templates20 { - typedef TemplateSel Head; - typedef Templates19 Tail; -}; - -template -struct Templates21 { - typedef TemplateSel Head; - typedef Templates20 Tail; -}; - -template -struct Templates22 { - typedef TemplateSel Head; - typedef Templates21 Tail; -}; - -template -struct Templates23 { - typedef TemplateSel Head; - typedef Templates22 Tail; -}; - -template -struct Templates24 { - typedef TemplateSel Head; - typedef Templates23 Tail; -}; - -template -struct Templates25 { - typedef TemplateSel Head; - typedef Templates24 Tail; -}; - -template -struct Templates26 { - typedef TemplateSel Head; - typedef Templates25 Tail; -}; - -template -struct Templates27 { - typedef 
TemplateSel Head; - typedef Templates26 Tail; -}; - -template -struct Templates28 { - typedef TemplateSel Head; - typedef Templates27 Tail; -}; - -template -struct Templates29 { - typedef TemplateSel Head; - typedef Templates28 Tail; -}; - -template -struct Templates30 { - typedef TemplateSel Head; - typedef Templates29 Tail; -}; - -template -struct Templates31 { - typedef TemplateSel Head; - typedef Templates30 Tail; -}; - -template -struct Templates32 { - typedef TemplateSel Head; - typedef Templates31 Tail; -}; - -template -struct Templates33 { - typedef TemplateSel Head; - typedef Templates32 Tail; -}; - -template -struct Templates34 { - typedef TemplateSel Head; - typedef Templates33 Tail; -}; - -template -struct Templates35 { - typedef TemplateSel Head; - typedef Templates34 Tail; -}; - -template -struct Templates36 { - typedef TemplateSel Head; - typedef Templates35 Tail; -}; - -template -struct Templates37 { - typedef TemplateSel Head; - typedef Templates36 Tail; -}; - -template -struct Templates38 { - typedef TemplateSel Head; - typedef Templates37 Tail; -}; - -template -struct Templates39 { - typedef TemplateSel Head; - typedef Templates38 Tail; -}; - -template -struct Templates40 { - typedef TemplateSel Head; - typedef Templates39 Tail; -}; - -template -struct Templates41 { - typedef TemplateSel Head; - typedef Templates40 Tail; -}; - -template -struct Templates42 { - typedef TemplateSel Head; - typedef Templates41 Tail; -}; - -template -struct Templates43 { - typedef TemplateSel Head; - typedef Templates42 Tail; -}; - -template -struct Templates44 { - typedef TemplateSel Head; - typedef Templates43 Tail; -}; - -template -struct Templates45 { - typedef TemplateSel Head; - typedef Templates44 Tail; -}; - -template -struct Templates46 { - typedef TemplateSel Head; - typedef Templates45 Tail; -}; - -template -struct Templates47 { - typedef TemplateSel Head; - typedef Templates46 Tail; -}; - -template -struct Templates48 { - typedef TemplateSel Head; - typedef Templates47 Tail; -}; - -template -struct Templates49 { - typedef TemplateSel Head; - typedef Templates48 Tail; -}; - -template -struct Templates50 { - typedef TemplateSel Head; - typedef Templates49 Tail; -}; - - -// We don't want to require the users to write TemplatesN<...> directly, -// as that would require them to count the length. Templates<...> is much -// easier to write, but generates horrible messages when there is a -// compiler error, as gcc insists on printing out each template -// argument, even if it has the default value (this means Templates -// will appear as Templates in the compiler -// errors). -// -// Our solution is to combine the best part of the two approaches: a -// user would write Templates, and Google Test will translate -// that to TemplatesN internally to make error messages -// readable. The translation is done by the 'type' member of the -// Templates template. 
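The TemplateSel/Bind trick referred to above (a "typedef for a class template") can likewise be condensed into a small standalone sketch; MyFixture is a hypothetical stand-in for a test template:

// Wraps a one-parameter class template so it can be passed around as an
// ordinary type and re-applied to a concrete type later.
template <template <typename T> class Tmpl>
struct TemplateSelSketch {
  template <typename T>
  struct Bind {
    typedef Tmpl<T> type;
  };
};

template <typename T>
struct MyFixture {};  // hypothetical class template

typedef TemplateSelSketch<MyFixture> Selector;   // the template, as a type
typedef Selector::Bind<int>::type FixtureOfInt;  // i.e. MyFixture<int>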
-template -struct Templates { - typedef Templates50 type; -}; - -template <> -struct Templates { - typedef Templates0 type; -}; -template -struct Templates { - typedef Templates1 type; -}; -template -struct Templates { - typedef Templates2 type; -}; -template -struct Templates { - typedef Templates3 type; -}; -template -struct Templates { - typedef Templates4 type; -}; -template -struct Templates { - typedef Templates5 type; -}; -template -struct Templates { - typedef Templates6 type; -}; -template -struct Templates { - typedef Templates7 type; -}; -template -struct Templates { - typedef Templates8 type; -}; -template -struct Templates { - typedef Templates9 type; -}; -template -struct Templates { - typedef Templates10 type; -}; -template -struct Templates { - typedef Templates11 type; -}; -template -struct Templates { - typedef Templates12 type; -}; -template -struct Templates { - typedef Templates13 type; -}; -template -struct Templates { - typedef Templates14 type; -}; -template -struct Templates { - typedef Templates15 type; -}; -template -struct Templates { - typedef Templates16 type; -}; -template -struct Templates { - typedef Templates17 type; -}; -template -struct Templates { - typedef Templates18 type; -}; -template -struct Templates { - typedef Templates19 type; -}; -template -struct Templates { - typedef Templates20 type; -}; -template -struct Templates { - typedef Templates21 type; -}; -template -struct Templates { - typedef Templates22 type; -}; -template -struct Templates { - typedef Templates23 type; -}; -template -struct Templates { - typedef Templates24 type; -}; -template -struct Templates { - typedef Templates25 type; -}; -template -struct Templates { - typedef Templates26 type; -}; -template -struct Templates { - typedef Templates27 type; -}; -template -struct Templates { - typedef Templates28 type; -}; -template -struct Templates { - typedef Templates29 type; -}; -template -struct Templates { - typedef Templates30 type; -}; -template -struct Templates { - typedef Templates31 type; -}; -template -struct Templates { - typedef Templates32 type; -}; -template -struct Templates { - typedef Templates33 type; -}; -template -struct Templates { - typedef Templates34 type; -}; -template -struct Templates { - typedef Templates35 type; -}; -template -struct Templates { - typedef Templates36 type; -}; -template -struct Templates { - typedef Templates37 type; -}; -template -struct Templates { - typedef Templates38 type; -}; -template -struct Templates { - typedef Templates39 type; -}; -template -struct Templates { - typedef Templates40 type; -}; -template -struct Templates { - typedef Templates41 type; -}; -template -struct Templates { - typedef Templates42 type; -}; -template -struct Templates { - typedef Templates43 type; -}; -template -struct Templates { - typedef Templates44 type; -}; -template -struct Templates { - typedef Templates45 type; -}; -template -struct Templates { - typedef Templates46 type; -}; -template -struct Templates { - typedef Templates47 type; -}; -template -struct Templates { - typedef Templates48 type; -}; -template -struct Templates { - typedef Templates49 type; -}; - -// The TypeList template makes it possible to use either a single type -// or a Types<...> list in TYPED_TEST_CASE() and -// INSTANTIATE_TYPED_TEST_CASE_P(). 
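At the user level this machinery surfaces through the typed-test macros of this gtest generation; an illustrative (not libvpx-specific) usage is:

#include <vector>
#include "gtest/gtest.h"

template <typename T>
class VectorTest : public ::testing::Test {};

// TYPED_TEST_CASE accepts either a single type or a ::testing::Types<...>
// list, which is exactly what the TypeList template below makes possible.
typedef ::testing::Types<int, long, double> MyTypes;
TYPED_TEST_CASE(VectorTest, MyTypes);

TYPED_TEST(VectorTest, StartsEmpty) {
  std::vector<TypeParam> v;  // TypeParam is the type currently under test
  EXPECT_TRUE(v.empty());
}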
- -template -struct TypeList { - typedef Types1 type; -}; - -template -struct TypeList > { - typedef typename Types::type type; -}; - -#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ - -// Due to C++ preprocessor weirdness, we need double indirection to -// concatenate two tokens when one of them is __LINE__. Writing -// -// foo ## __LINE__ -// -// will result in the token foo__LINE__, instead of foo followed by -// the current line number. For more details, see -// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6 -#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar) -#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar - -class ProtocolMessage; -namespace proto2 { class Message; } - -namespace testing { - -// Forward declarations. - -class AssertionResult; // Result of an assertion. -class Message; // Represents a failure message. -class Test; // Represents a test. -class TestInfo; // Information about a test. -class TestPartResult; // Result of a test part. -class UnitTest; // A collection of test cases. - -template -::std::string PrintToString(const T& value); - -namespace internal { - -struct TraceInfo; // Information about a trace point. -class ScopedTrace; // Implements scoped trace. -class TestInfoImpl; // Opaque implementation of TestInfo -class UnitTestImpl; // Opaque implementation of UnitTest - -// How many times InitGoogleTest() has been called. -GTEST_API_ extern int g_init_gtest_count; - -// The text used in failure messages to indicate the start of the -// stack trace. -GTEST_API_ extern const char kStackTraceMarker[]; - -// Two overloaded helpers for checking at compile time whether an -// expression is a null pointer literal (i.e. NULL or any 0-valued -// compile-time integral constant). Their return values have -// different sizes, so we can use sizeof() to test which version is -// picked by the compiler. These helpers have no implementations, as -// we only need their signatures. -// -// Given IsNullLiteralHelper(x), the compiler will pick the first -// version if x can be implicitly converted to Secret*, and pick the -// second version otherwise. Since Secret is a secret and incomplete -// type, the only expression a user can write that has type Secret* is -// a null pointer literal. Therefore, we know that x is a null -// pointer literal if and only if the first version is picked by the -// compiler. -char IsNullLiteralHelper(Secret* p); -char (&IsNullLiteralHelper(...))[2]; // NOLINT - -// A compile-time bool constant that is true if and only if x is a -// null pointer literal (i.e. NULL or any 0-valued compile-time -// integral constant). -#ifdef GTEST_ELLIPSIS_NEEDS_POD_ -// We lose support for NULL detection where the compiler doesn't like -// passing non-POD classes through ellipsis (...). -# define GTEST_IS_NULL_LITERAL_(x) false -#else -# define GTEST_IS_NULL_LITERAL_(x) \ - (sizeof(::testing::internal::IsNullLiteralHelper(x)) == 1) -#endif // GTEST_ELLIPSIS_NEEDS_POD_ - -// Appends the user-supplied message to the Google-Test-generated message. -GTEST_API_ std::string AppendUserMessage( - const std::string& gtest_msg, const Message& user_msg); - -#if GTEST_HAS_EXCEPTIONS - -// This exception is thrown by (and only by) a failed Google Test -// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions -// are enabled). 
We derive it from std::runtime_error, which is for -// errors presumably detectable only at run time. Since -// std::runtime_error inherits from std::exception, many testing -// frameworks know how to extract and print the message inside it. -class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error { - public: - explicit GoogleTestFailureException(const TestPartResult& failure); -}; - -#endif // GTEST_HAS_EXCEPTIONS - -// A helper class for creating scoped traces in user programs. -class GTEST_API_ ScopedTrace { - public: - // The c'tor pushes the given source file location and message onto - // a trace stack maintained by Google Test. - ScopedTrace(const char* file, int line, const Message& message); - - // The d'tor pops the info pushed by the c'tor. - // - // Note that the d'tor is not virtual in order to be efficient. - // Don't inherit from ScopedTrace! - ~ScopedTrace(); - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace); -} GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its - // c'tor and d'tor. Therefore it doesn't - // need to be used otherwise. - -// Constructs and returns the message for an equality assertion -// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. -// -// The first four parameters are the expressions used in the assertion -// and their values, as strings. For example, for ASSERT_EQ(foo, bar) -// where foo is 5 and bar is 6, we have: -// -// expected_expression: "foo" -// actual_expression: "bar" -// expected_value: "5" -// actual_value: "6" -// -// The ignoring_case parameter is true iff the assertion is a -// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will -// be inserted into the message. -GTEST_API_ AssertionResult EqFailure(const char* expected_expression, - const char* actual_expression, - const std::string& expected_value, - const std::string& actual_value, - bool ignoring_case); - -// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. -GTEST_API_ std::string GetBoolAssertionFailureMessage( - const AssertionResult& assertion_result, - const char* expression_text, - const char* actual_predicate_value, - const char* expected_predicate_value); - -// This template class represents an IEEE floating-point number -// (either single-precision or double-precision, depending on the -// template parameters). -// -// The purpose of this class is to do more sophisticated number -// comparison. (Due to round-off error, etc, it's very unlikely that -// two floating-points will be equal exactly. Hence a naive -// comparison by the == operation often doesn't work.) -// -// Format of IEEE floating-point: -// -// The most-significant bit being the leftmost, an IEEE -// floating-point looks like -// -// sign_bit exponent_bits fraction_bits -// -// Here, sign_bit is a single bit that designates the sign of the -// number. -// -// For float, there are 8 exponent bits and 23 fraction bits. -// -// For double, there are 11 exponent bits and 52 fraction bits. -// -// More details can be found at -// http://en.wikipedia.org/wiki/IEEE_floating-point_standard. -// -// Template parameter: -// -// RawType: the raw floating-point type (either float or double) -template -class FloatingPoint { - public: - // Defines the unsigned integer type that has the same size as the - // floating point number. - typedef typename TypeWithSize::UInt Bits; - - // Constants. - - // # of bits in a number. - static const size_t kBitCount = 8*sizeof(RawType); - - // # of fraction bits in a number. 
- static const size_t kFractionBitCount = - std::numeric_limits::digits - 1; - - // # of exponent bits in a number. - static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount; - - // The mask for the sign bit. - static const Bits kSignBitMask = static_cast(1) << (kBitCount - 1); - - // The mask for the fraction bits. - static const Bits kFractionBitMask = - ~static_cast(0) >> (kExponentBitCount + 1); - - // The mask for the exponent bits. - static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask); - - // How many ULP's (Units in the Last Place) we want to tolerate when - // comparing two numbers. The larger the value, the more error we - // allow. A 0 value means that two numbers must be exactly the same - // to be considered equal. - // - // The maximum error of a single floating-point operation is 0.5 - // units in the last place. On Intel CPU's, all floating-point - // calculations are done with 80-bit precision, while double has 64 - // bits. Therefore, 4 should be enough for ordinary use. - // - // See the following article for more details on ULP: - // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ - static const size_t kMaxUlps = 4; - - // Constructs a FloatingPoint from a raw floating-point number. - // - // On an Intel CPU, passing a non-normalized NAN (Not a Number) - // around may change its bits, although the new value is guaranteed - // to be also a NAN. Therefore, don't expect this constructor to - // preserve the bits in x when x is a NAN. - explicit FloatingPoint(const RawType& x) { u_.value_ = x; } - - // Static methods - - // Reinterprets a bit pattern as a floating-point number. - // - // This function is needed to test the AlmostEquals() method. - static RawType ReinterpretBits(const Bits bits) { - FloatingPoint fp(0); - fp.u_.bits_ = bits; - return fp.u_.value_; - } - - // Returns the floating-point number that represent positive infinity. - static RawType Infinity() { - return ReinterpretBits(kExponentBitMask); - } - - // Returns the maximum representable finite floating-point number. - static RawType Max(); - - // Non-static methods - - // Returns the bits that represents this number. - const Bits &bits() const { return u_.bits_; } - - // Returns the exponent bits of this number. - Bits exponent_bits() const { return kExponentBitMask & u_.bits_; } - - // Returns the fraction bits of this number. - Bits fraction_bits() const { return kFractionBitMask & u_.bits_; } - - // Returns the sign bit of this number. - Bits sign_bit() const { return kSignBitMask & u_.bits_; } - - // Returns true iff this is NAN (not a number). - bool is_nan() const { - // It's a NAN if the exponent bits are all ones and the fraction - // bits are not entirely zeros. - return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0); - } - - // Returns true iff this number is at most kMaxUlps ULP's away from - // rhs. In particular, this function: - // - // - returns false if either number is (or both are) NAN. - // - treats really large numbers as almost equal to infinity. - // - thinks +0.0 and -0.0 are 0 DLP's apart. - bool AlmostEquals(const FloatingPoint& rhs) const { - // The IEEE standard says that any comparison operation involving - // a NAN must return false. - if (is_nan() || rhs.is_nan()) return false; - - return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) - <= kMaxUlps; - } - - private: - // The data type used to store the actual floating-point number. 
- union FloatingPointUnion { - RawType value_; // The raw floating-point number. - Bits bits_; // The bits that represent the number. - }; - - // Converts an integer from the sign-and-magnitude representation to - // the biased representation. More precisely, let N be 2 to the - // power of (kBitCount - 1), an integer x is represented by the - // unsigned number x + N. - // - // For instance, - // - // -N + 1 (the most negative number representable using - // sign-and-magnitude) is represented by 1; - // 0 is represented by N; and - // N - 1 (the biggest number representable using - // sign-and-magnitude) is represented by 2N - 1. - // - // Read http://en.wikipedia.org/wiki/Signed_number_representations - // for more details on signed number representations. - static Bits SignAndMagnitudeToBiased(const Bits &sam) { - if (kSignBitMask & sam) { - // sam represents a negative number. - return ~sam + 1; - } else { - // sam represents a positive number. - return kSignBitMask | sam; - } - } - - // Given two numbers in the sign-and-magnitude representation, - // returns the distance between them as an unsigned number. - static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1, - const Bits &sam2) { - const Bits biased1 = SignAndMagnitudeToBiased(sam1); - const Bits biased2 = SignAndMagnitudeToBiased(sam2); - return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1); - } - - FloatingPointUnion u_; -}; - -// We cannot use std::numeric_limits::max() as it clashes with the max() -// macro defined by . -template <> -inline float FloatingPoint::Max() { return FLT_MAX; } -template <> -inline double FloatingPoint::Max() { return DBL_MAX; } - -// Typedefs the instances of the FloatingPoint template class that we -// care to use. -typedef FloatingPoint Float; -typedef FloatingPoint Double; - -// In order to catch the mistake of putting tests that use different -// test fixture classes in the same test case, we need to assign -// unique IDs to fixture classes and compare them. The TypeId type is -// used to hold such IDs. The user should treat TypeId as an opaque -// type: the only operation allowed on TypeId values is to compare -// them for equality using the == operator. -typedef const void* TypeId; - -template -class TypeIdHelper { - public: - // dummy_ must not have a const type. Otherwise an overly eager - // compiler (e.g. MSVC 7.1 & 8.0) may try to merge - // TypeIdHelper::dummy_ for different Ts as an "optimization". - static bool dummy_; -}; - -template -bool TypeIdHelper::dummy_ = false; - -// GetTypeId() returns the ID of type T. Different values will be -// returned for different types. Calling the function twice with the -// same type argument is guaranteed to return the same ID. -template -TypeId GetTypeId() { - // The compiler is required to allocate a different - // TypeIdHelper::dummy_ variable for each T used to instantiate - // the template. Therefore, the address of dummy_ is guaranteed to - // be unique. - return &(TypeIdHelper::dummy_); -} - -// Returns the type ID of ::testing::Test. Always call this instead -// of GetTypeId< ::testing::Test>() to get the type ID of -// ::testing::Test, as the latter may give the wrong result due to a -// suspected linker bug when compiling Google Test as a Mac OS X -// framework. -GTEST_API_ TypeId GetTestTypeId(); - -// Defines the abstract factory interface that creates instances -// of a Test object. -class TestFactoryBase { - public: - virtual ~TestFactoryBase() {} - - // Creates a test instance to run. 
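Before the test-factory interface continues below, it is worth noting how the FloatingPoint/AlmostEquals machinery above reaches users: the floating-point assertions compare values by ULP distance rather than with ==. An illustrative test:

#include "gtest/gtest.h"

TEST(FloatCompare, UlpBasedEquality) {
  const float sum = 0.1f + 0.2f;
  // EXPECT_FLOAT_EQ uses AlmostEquals(): the operands must be within
  // kMaxUlps (4) units in the last place of each other.
  EXPECT_FLOAT_EQ(0.3f, sum);
  // EXPECT_NEAR takes an explicit absolute error bound instead.
  EXPECT_NEAR(0.3, 0.1 + 0.2, 1e-9);
}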
The instance is both created and destroyed - // within TestInfoImpl::Run() - virtual Test* CreateTest() = 0; - - protected: - TestFactoryBase() {} - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase); -}; - -// This class provides implementation of TeastFactoryBase interface. -// It is used in TEST and TEST_F macros. -template -class TestFactoryImpl : public TestFactoryBase { - public: - virtual Test* CreateTest() { return new TestClass; } -}; - -#if GTEST_OS_WINDOWS - -// Predicate-formatters for implementing the HRESULT checking macros -// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED} -// We pass a long instead of HRESULT to avoid causing an -// include dependency for the HRESULT type. -GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr, - long hr); // NOLINT -GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr, - long hr); // NOLINT - -#endif // GTEST_OS_WINDOWS - -// Types of SetUpTestCase() and TearDownTestCase() functions. -typedef void (*SetUpTestCaseFunc)(); -typedef void (*TearDownTestCaseFunc)(); - -// Creates a new TestInfo object and registers it with Google Test; -// returns the created object. -// -// Arguments: -// -// test_case_name: name of the test case -// name: name of the test -// type_param the name of the test's type parameter, or NULL if -// this is not a typed or a type-parameterized test. -// value_param text representation of the test's value parameter, -// or NULL if this is not a type-parameterized test. -// fixture_class_id: ID of the test fixture class -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case -// factory: pointer to the factory that creates a test object. -// The newly created TestInfo instance will assume -// ownership of the factory object. -GTEST_API_ TestInfo* MakeAndRegisterTestInfo( - const char* test_case_name, - const char* name, - const char* type_param, - const char* value_param, - TypeId fixture_class_id, - SetUpTestCaseFunc set_up_tc, - TearDownTestCaseFunc tear_down_tc, - TestFactoryBase* factory); - -// If *pstr starts with the given prefix, modifies *pstr to be right -// past the prefix and returns true; otherwise leaves *pstr unchanged -// and returns false. None of pstr, *pstr, and prefix can be NULL. -GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr); - -#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - -// State of the definition of a type-parameterized test case. -class GTEST_API_ TypedTestCasePState { - public: - TypedTestCasePState() : registered_(false) {} - - // Adds the given test name to defined_test_names_ and return true - // if the test case hasn't been registered; otherwise aborts the - // program. - bool AddTestName(const char* file, int line, const char* case_name, - const char* test_name) { - if (registered_) { - fprintf(stderr, "%s Test %s must be defined before " - "REGISTER_TYPED_TEST_CASE_P(%s, ...).\n", - FormatFileLocation(file, line).c_str(), test_name, case_name); - fflush(stderr); - posix::Abort(); - } - defined_test_names_.insert(test_name); - return true; - } - - // Verifies that registered_tests match the test names in - // defined_test_names_; returns registered_tests if successful, or - // aborts the program otherwise. 
- const char* VerifyRegisteredTestNames( - const char* file, int line, const char* registered_tests); - - private: - bool registered_; - ::std::set defined_test_names_; -}; - -// Skips to the first non-space char after the first comma in 'str'; -// returns NULL if no comma is found in 'str'. -inline const char* SkipComma(const char* str) { - const char* comma = strchr(str, ','); - if (comma == NULL) { - return NULL; - } - while (IsSpace(*(++comma))) {} - return comma; -} - -// Returns the prefix of 'str' before the first comma in it; returns -// the entire string if it contains no comma. -inline std::string GetPrefixUntilComma(const char* str) { - const char* comma = strchr(str, ','); - return comma == NULL ? str : std::string(str, comma); -} - -// TypeParameterizedTest::Register() -// registers a list of type-parameterized tests with Google Test. The -// return value is insignificant - we just need to return something -// such that we can call this function in a namespace scope. -// -// Implementation note: The GTEST_TEMPLATE_ macro declares a template -// template parameter. It's defined in gtest-type-util.h. -template -class TypeParameterizedTest { - public: - // 'index' is the index of the test in the type list 'Types' - // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase, - // Types). Valid values for 'index' are [0, N - 1] where N is the - // length of Types. - static bool Register(const char* prefix, const char* case_name, - const char* test_names, int index) { - typedef typename Types::Head Type; - typedef Fixture FixtureClass; - typedef typename GTEST_BIND_(TestSel, Type) TestClass; - - // First, registers the first type-parameterized test in the type - // list. - MakeAndRegisterTestInfo( - (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + "/" - + StreamableToString(index)).c_str(), - GetPrefixUntilComma(test_names).c_str(), - GetTypeName().c_str(), - NULL, // No value parameter. - GetTypeId(), - TestClass::SetUpTestCase, - TestClass::TearDownTestCase, - new TestFactoryImpl); - - // Next, recurses (at compile time) with the tail of the type list. - return TypeParameterizedTest - ::Register(prefix, case_name, test_names, index + 1); - } -}; - -// The base case for the compile time recursion. -template -class TypeParameterizedTest { - public: - static bool Register(const char* /*prefix*/, const char* /*case_name*/, - const char* /*test_names*/, int /*index*/) { - return true; - } -}; - -// TypeParameterizedTestCase::Register() -// registers *all combinations* of 'Tests' and 'Types' with Google -// Test. The return value is insignificant - we just need to return -// something such that we can call this function in a namespace scope. -template -class TypeParameterizedTestCase { - public: - static bool Register(const char* prefix, const char* case_name, - const char* test_names) { - typedef typename Tests::Head Head; - - // First, register the first test in 'Test' for each type in 'Types'. - TypeParameterizedTest::Register( - prefix, case_name, test_names, 0); - - // Next, recurses (at compile time) with the tail of the test list. - return TypeParameterizedTestCase - ::Register(prefix, case_name, SkipComma(test_names)); - } -}; - -// The base case for the compile time recursion. 
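The Register() recursion above and the base cases below implement what users write with the type-parameterized test macros; an illustrative end-to-end flow (fixture and type list invented):

#include <vector>
#include "gtest/gtest.h"

template <typename T>
class ContainerTest : public ::testing::Test {};

TYPED_TEST_CASE_P(ContainerTest);

TYPED_TEST_P(ContainerTest, DefaultConstructedIsEmpty) {
  TypeParam c;
  EXPECT_EQ(0u, c.size());
}

// Roughly, this is where VerifyRegisteredTestNames() above gets to run.
REGISTER_TYPED_TEST_CASE_P(ContainerTest, DefaultConstructedIsEmpty);

// And this expands into the TypeParameterizedTestCase<...>::Register() call.
typedef ::testing::Types<std::vector<int>, std::vector<double> > ContainerTypes;
INSTANTIATE_TYPED_TEST_CASE_P(My, ContainerTest, ContainerTypes);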
-template -class TypeParameterizedTestCase { - public: - static bool Register(const char* /*prefix*/, const char* /*case_name*/, - const char* /*test_names*/) { - return true; - } -}; - -#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - -// Returns the current OS stack trace as an std::string. -// -// The maximum number of stack frames to be included is specified by -// the gtest_stack_trace_depth flag. The skip_count parameter -// specifies the number of top frames to be skipped, which doesn't -// count against the number of frames to be included. -// -// For example, if Foo() calls Bar(), which in turn calls -// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in -// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. -GTEST_API_ std::string GetCurrentOsStackTraceExceptTop( - UnitTest* unit_test, int skip_count); - -// Helpers for suppressing warnings on unreachable code or constant -// condition. - -// Always returns true. -GTEST_API_ bool AlwaysTrue(); - -// Always returns false. -inline bool AlwaysFalse() { return !AlwaysTrue(); } - -// Helper for suppressing false warning from Clang on a const char* -// variable declared in a conditional expression always being NULL in -// the else branch. -struct GTEST_API_ ConstCharPtr { - ConstCharPtr(const char* str) : value(str) {} - operator bool() const { return true; } - const char* value; -}; - -// A simple Linear Congruential Generator for generating random -// numbers with a uniform distribution. Unlike rand() and srand(), it -// doesn't use global state (and therefore can't interfere with user -// code). Unlike rand_r(), it's portable. An LCG isn't very random, -// but it's good enough for our purposes. -class GTEST_API_ Random { - public: - static const UInt32 kMaxRange = 1u << 31; - - explicit Random(UInt32 seed) : state_(seed) {} - - void Reseed(UInt32 seed) { state_ = seed; } - - // Generates a random number from [0, range). Crashes if 'range' is - // 0 or greater than kMaxRange. - UInt32 Generate(UInt32 range); - - private: - UInt32 state_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(Random); -}; - -// Defining a variable of type CompileAssertTypesEqual will cause a -// compiler error iff T1 and T2 are different types. -template -struct CompileAssertTypesEqual; - -template -struct CompileAssertTypesEqual { -}; - -// Removes the reference from a type if it is a reference type, -// otherwise leaves it unchanged. This is the same as -// tr1::remove_reference, which is not widely available yet. -template -struct RemoveReference { typedef T type; }; // NOLINT -template -struct RemoveReference { typedef T type; }; // NOLINT - -// A handy wrapper around RemoveReference that works when the argument -// T depends on template parameters. -#define GTEST_REMOVE_REFERENCE_(T) \ - typename ::testing::internal::RemoveReference::type - -// Removes const from a type if it is a const type, otherwise leaves -// it unchanged. This is the same as tr1::remove_const, which is not -// widely available yet. -template -struct RemoveConst { typedef T type; }; // NOLINT -template -struct RemoveConst { typedef T type; }; // NOLINT - -// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above -// definition to fail to remove the const in 'const int[3]' and 'const -// char[3][4]'. The following specialization works around the bug. 
-template -struct RemoveConst { - typedef typename RemoveConst::type type[N]; -}; - -#if defined(_MSC_VER) && _MSC_VER < 1400 -// This is the only specialization that allows VC++ 7.1 to remove const in -// 'const int[3] and 'const int[3][4]'. However, it causes trouble with GCC -// and thus needs to be conditionally compiled. -template -struct RemoveConst { - typedef typename RemoveConst::type type[N]; -}; -#endif - -// A handy wrapper around RemoveConst that works when the argument -// T depends on template parameters. -#define GTEST_REMOVE_CONST_(T) \ - typename ::testing::internal::RemoveConst::type - -// Turns const U&, U&, const U, and U all into U. -#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \ - GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T)) - -// Adds reference to a type if it is not a reference type, -// otherwise leaves it unchanged. This is the same as -// tr1::add_reference, which is not widely available yet. -template -struct AddReference { typedef T& type; }; // NOLINT -template -struct AddReference { typedef T& type; }; // NOLINT - -// A handy wrapper around AddReference that works when the argument T -// depends on template parameters. -#define GTEST_ADD_REFERENCE_(T) \ - typename ::testing::internal::AddReference::type - -// Adds a reference to const on top of T as necessary. For example, -// it transforms -// -// char ==> const char& -// const char ==> const char& -// char& ==> const char& -// const char& ==> const char& -// -// The argument T must depend on some template parameters. -#define GTEST_REFERENCE_TO_CONST_(T) \ - GTEST_ADD_REFERENCE_(const GTEST_REMOVE_REFERENCE_(T)) - -// ImplicitlyConvertible::value is a compile-time bool -// constant that's true iff type From can be implicitly converted to -// type To. -template -class ImplicitlyConvertible { - private: - // We need the following helper functions only for their types. - // They have no implementations. - - // MakeFrom() is an expression whose type is From. We cannot simply - // use From(), as the type From may not have a public default - // constructor. - static From MakeFrom(); - - // These two functions are overloaded. Given an expression - // Helper(x), the compiler will pick the first version if x can be - // implicitly converted to type To; otherwise it will pick the - // second version. - // - // The first version returns a value of size 1, and the second - // version returns a value of size 2. Therefore, by checking the - // size of Helper(x), which can be done at compile time, we can tell - // which version of Helper() is used, and hence whether x can be - // implicitly converted to type To. - static char Helper(To); - static char (&Helper(...))[2]; // NOLINT - - // We have to put the 'public' section after the 'private' section, - // or MSVC refuses to compile the code. - public: - // MSVC warns about implicitly converting from double to int for - // possible loss of data, so we need to temporarily disable the - // warning. -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4244) // Temporarily disables warning 4244. - - static const bool value = - sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1; -# pragma warning(pop) // Restores the warning state. -#elif defined(__BORLANDC__) - // C++Builder cannot use member overload resolution during template - // instantiation. The simplest workaround is to use its C++0x type traits - // functions (C++Builder 2009 and above only). 
- static const bool value = __is_convertible(From, To); -#else - static const bool value = - sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1; -#endif // _MSV_VER -}; -template -const bool ImplicitlyConvertible::value; - -// IsAProtocolMessage::value is a compile-time bool constant that's -// true iff T is type ProtocolMessage, proto2::Message, or a subclass -// of those. -template -struct IsAProtocolMessage - : public bool_constant< - ImplicitlyConvertible::value || - ImplicitlyConvertible::value> { -}; - -// When the compiler sees expression IsContainerTest(0), if C is an -// STL-style container class, the first overload of IsContainerTest -// will be viable (since both C::iterator* and C::const_iterator* are -// valid types and NULL can be implicitly converted to them). It will -// be picked over the second overload as 'int' is a perfect match for -// the type of argument 0. If C::iterator or C::const_iterator is not -// a valid type, the first overload is not viable, and the second -// overload will be picked. Therefore, we can determine whether C is -// a container class by checking the type of IsContainerTest(0). -// The value of the expression is insignificant. -// -// Note that we look for both C::iterator and C::const_iterator. The -// reason is that C++ injects the name of a class as a member of the -// class itself (e.g. you can refer to class iterator as either -// 'iterator' or 'iterator::iterator'). If we look for C::iterator -// only, for example, we would mistakenly think that a class named -// iterator is an STL container. -// -// Also note that the simpler approach of overloading -// IsContainerTest(typename C::const_iterator*) and -// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++. -typedef int IsContainer; -template -IsContainer IsContainerTest(int /* dummy */, - typename C::iterator* /* it */ = NULL, - typename C::const_iterator* /* const_it */ = NULL) { - return 0; -} - -typedef char IsNotContainer; -template -IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; } - -// EnableIf::type is void when 'Cond' is true, and -// undefined when 'Cond' is false. To use SFINAE to make a function -// overload only apply when a particular expression is true, add -// "typename EnableIf::type* = 0" as the last parameter. -template struct EnableIf; -template<> struct EnableIf { typedef void type; }; // NOLINT - -// Utilities for native arrays. - -// ArrayEq() compares two k-dimensional native arrays using the -// elements' operator==, where k can be any integer >= 0. When k is -// 0, ArrayEq() degenerates into comparing a single pair of values. - -template -bool ArrayEq(const T* lhs, size_t size, const U* rhs); - -// This generic version is used when k is 0. -template -inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; } - -// This overload is used when k >= 1. -template -inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) { - return internal::ArrayEq(lhs, N, rhs); -} - -// This helper reduces code bloat. If we instead put its logic inside -// the previous ArrayEq() function, arrays with different sizes would -// lead to different copies of the template code. -template -bool ArrayEq(const T* lhs, size_t size, const U* rhs) { - for (size_t i = 0; i != size; i++) { - if (!internal::ArrayEq(lhs[i], rhs[i])) - return false; - } - return true; -} - -// Finds the first element in the iterator range [begin, end) that -// equals elem. Element may be a native array type itself. 
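The sizeof-overload idiom that ImplicitlyConvertible (and IsContainerTest) rely on is easier to read in isolation; a standalone sketch with invented names:

#include <iostream>

// Neither Helper() overload is ever called; only the sizes of their return
// types are inspected at compile time.
template <typename From, typename To>
class ConvertibleSketch {
 private:
  static From MakeFrom();         // never defined; used only for its type
  static char Helper(To);         // chosen if From converts to To
  static char (&Helper(...))[2];  // fallback otherwise
 public:
  static const bool value = sizeof(Helper(MakeFrom())) == 1;
};

int main() {
  const bool a = ConvertibleSketch<int, double>::value;   // true
  const bool b = ConvertibleSketch<int*, double>::value;  // false
  std::cout << a << " " << b << "\n";
  return 0;
}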
-template -Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) { - for (Iter it = begin; it != end; ++it) { - if (internal::ArrayEq(*it, elem)) - return it; - } - return end; -} - -// CopyArray() copies a k-dimensional native array using the elements' -// operator=, where k can be any integer >= 0. When k is 0, -// CopyArray() degenerates into copying a single value. - -template -void CopyArray(const T* from, size_t size, U* to); - -// This generic version is used when k is 0. -template -inline void CopyArray(const T& from, U* to) { *to = from; } - -// This overload is used when k >= 1. -template -inline void CopyArray(const T(&from)[N], U(*to)[N]) { - internal::CopyArray(from, N, *to); -} - -// This helper reduces code bloat. If we instead put its logic inside -// the previous CopyArray() function, arrays with different sizes -// would lead to different copies of the template code. -template -void CopyArray(const T* from, size_t size, U* to) { - for (size_t i = 0; i != size; i++) { - internal::CopyArray(from[i], to + i); - } -} - -// The relation between an NativeArray object (see below) and the -// native array it represents. -enum RelationToSource { - kReference, // The NativeArray references the native array. - kCopy // The NativeArray makes a copy of the native array and - // owns the copy. -}; - -// Adapts a native array to a read-only STL-style container. Instead -// of the complete STL container concept, this adaptor only implements -// members useful for Google Mock's container matchers. New members -// should be added as needed. To simplify the implementation, we only -// support Element being a raw type (i.e. having no top-level const or -// reference modifier). It's the client's responsibility to satisfy -// this requirement. Element can be an array type itself (hence -// multi-dimensional arrays are supported). -template -class NativeArray { - public: - // STL-style container typedefs. - typedef Element value_type; - typedef Element* iterator; - typedef const Element* const_iterator; - - // Constructs from a native array. - NativeArray(const Element* array, size_t count, RelationToSource relation) { - Init(array, count, relation); - } - - // Copy constructor. - NativeArray(const NativeArray& rhs) { - Init(rhs.array_, rhs.size_, rhs.relation_to_source_); - } - - ~NativeArray() { - // Ensures that the user doesn't instantiate NativeArray with a - // const or reference type. - static_cast(StaticAssertTypeEqHelper()); - if (relation_to_source_ == kCopy) - delete[] array_; - } - - // STL-style container methods. - size_t size() const { return size_; } - const_iterator begin() const { return array_; } - const_iterator end() const { return array_ + size_; } - bool operator==(const NativeArray& rhs) const { - return size() == rhs.size() && - ArrayEq(begin(), size(), rhs.begin()); - } - - private: - // Initializes this object; makes a copy of the input array if - // 'relation' is kCopy. 
- void Init(const Element* array, size_t a_size, RelationToSource relation) { - if (relation == kReference) { - array_ = array; - } else { - Element* const copy = new Element[a_size]; - CopyArray(array, a_size, copy); - array_ = copy; - } - size_ = a_size; - relation_to_source_ = relation; - } - - const Element* array_; - size_t size_; - RelationToSource relation_to_source_; - - GTEST_DISALLOW_ASSIGN_(NativeArray); -}; - -} // namespace internal -} // namespace testing - -#define GTEST_MESSAGE_AT_(file, line, message, result_type) \ - ::testing::internal::AssertHelper(result_type, file, line, message) \ - = ::testing::Message() - -#define GTEST_MESSAGE_(message, result_type) \ - GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type) - -#define GTEST_FATAL_FAILURE_(message) \ - return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure) - -#define GTEST_NONFATAL_FAILURE_(message) \ - GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure) - -#define GTEST_SUCCESS_(message) \ - GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess) - -// Suppresses MSVC warnings 4072 (unreachable code) for the code following -// statement if it returns or throws (or doesn't return or throw in some -// situations). -#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \ - if (::testing::internal::AlwaysTrue()) { statement; } - -#define GTEST_TEST_THROW_(statement, expected_exception, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::ConstCharPtr gtest_msg = "") { \ - bool gtest_caught_expected = false; \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - catch (expected_exception const&) { \ - gtest_caught_expected = true; \ - } \ - catch (...) { \ - gtest_msg.value = \ - "Expected: " #statement " throws an exception of type " \ - #expected_exception ".\n Actual: it throws a different type."; \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ - } \ - if (!gtest_caught_expected) { \ - gtest_msg.value = \ - "Expected: " #statement " throws an exception of type " \ - #expected_exception ".\n Actual: it throws nothing."; \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \ - fail(gtest_msg.value) - -#define GTEST_TEST_NO_THROW_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - catch (...) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \ - fail("Expected: " #statement " doesn't throw an exception.\n" \ - " Actual: it throws.") - -#define GTEST_TEST_ANY_THROW_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - bool gtest_caught_any = false; \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - catch (...) { \ - gtest_caught_any = true; \ - } \ - if (!gtest_caught_any) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \ - fail("Expected: " #statement " throws an exception.\n" \ - " Actual: it doesn't.") - - -// Implements Boolean test assertions such as EXPECT_TRUE. expression can be -// either a boolean expression or an AssertionResult. 
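The exception-related macros above back the user-facing EXPECT_THROW family; an illustrative test:

#include <stdexcept>
#include <vector>
#include "gtest/gtest.h"

TEST(VectorAt, OutOfRangeChecks) {
  std::vector<int> v(3);
  EXPECT_THROW(v.at(10), std::out_of_range);  // must throw exactly this type
  EXPECT_NO_THROW(v.at(1));                   // must not throw at all
  EXPECT_ANY_THROW(v.at(99));                 // any exception type will do
}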
text is a textual -// represenation of expression as it was passed into the EXPECT_TRUE. -#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (const ::testing::AssertionResult gtest_ar_ = \ - ::testing::AssertionResult(expression)) \ - ; \ - else \ - fail(::testing::internal::GetBoolAssertionFailureMessage(\ - gtest_ar_, text, #actual, #expected).c_str()) - -#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \ - fail("Expected: " #statement " doesn't generate new fatal " \ - "failures in the current thread.\n" \ - " Actual: it does.") - -// Expands to the name of the class that implements the given test. -#define GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ - test_case_name##_##test_name##_Test - -// Helper macro for defining tests. -#define GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)\ -class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\ - public:\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\ - private:\ - virtual void TestBody();\ - static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\ - GTEST_DISALLOW_COPY_AND_ASSIGN_(\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\ -};\ -\ -::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\ - ::test_info_ =\ - ::testing::internal::MakeAndRegisterTestInfo(\ - #test_case_name, #test_name, NULL, NULL, \ - (parent_id), \ - parent_class::SetUpTestCase, \ - parent_class::TearDownTestCase, \ - new ::testing::internal::TestFactoryImpl<\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\ -void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file defines the public API for death tests. It is -// #included by gtest.h so a user doesn't need to include this -// directly. - -#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ - -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file defines internal utilities needed for implementing -// death tests. They are subject to change without notice. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ - - -#include - -namespace testing { -namespace internal { - -GTEST_DECLARE_string_(internal_run_death_test); - -// Names of the flags (needed for parsing Google Test flags). -const char kDeathTestStyleFlag[] = "death_test_style"; -const char kDeathTestUseFork[] = "death_test_use_fork"; -const char kInternalRunDeathTestFlag[] = "internal_run_death_test"; - -#if GTEST_HAS_DEATH_TEST - -// DeathTest is a class that hides much of the complexity of the -// GTEST_DEATH_TEST_ macro. 
It is abstract; its static Create method -// returns a concrete class that depends on the prevailing death test -// style, as defined by the --gtest_death_test_style and/or -// --gtest_internal_run_death_test flags. - -// In describing the results of death tests, these terms are used with -// the corresponding definitions: -// -// exit status: The integer exit information in the format specified -// by wait(2) -// exit code: The integer code passed to exit(3), _exit(2), or -// returned from main() -class GTEST_API_ DeathTest { - public: - // Create returns false if there was an error determining the - // appropriate action to take for the current death test; for example, - // if the gtest_death_test_style flag is set to an invalid value. - // The LastMessage method will return a more detailed message in that - // case. Otherwise, the DeathTest pointer pointed to by the "test" - // argument is set. If the death test should be skipped, the pointer - // is set to NULL; otherwise, it is set to the address of a new concrete - // DeathTest object that controls the execution of the current test. - static bool Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test); - DeathTest(); - virtual ~DeathTest() { } - - // A helper class that aborts a death test when it's deleted. - class ReturnSentinel { - public: - explicit ReturnSentinel(DeathTest* test) : test_(test) { } - ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); } - private: - DeathTest* const test_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel); - } GTEST_ATTRIBUTE_UNUSED_; - - // An enumeration of possible roles that may be taken when a death - // test is encountered. EXECUTE means that the death test logic should - // be executed immediately. OVERSEE means that the program should prepare - // the appropriate environment for a child process to execute the death - // test, then wait for it to complete. - enum TestRole { OVERSEE_TEST, EXECUTE_TEST }; - - // An enumeration of the three reasons that a test might be aborted. - enum AbortReason { - TEST_ENCOUNTERED_RETURN_STATEMENT, - TEST_THREW_EXCEPTION, - TEST_DID_NOT_DIE - }; - - // Assumes one of the above roles. - virtual TestRole AssumeRole() = 0; - - // Waits for the death test to finish and returns its status. - virtual int Wait() = 0; - - // Returns true if the death test passed; that is, the test process - // exited during the test, its exit status matches a user-supplied - // predicate, and its stderr output matches a user-supplied regular - // expression. - // The user-supplied predicate may be a macro expression rather - // than a function pointer or functor, or else Wait and Passed could - // be combined. - virtual bool Passed(bool exit_status_ok) = 0; - - // Signals that the death test did not die as expected. - virtual void Abort(AbortReason reason) = 0; - - // Returns a human-readable outcome message regarding the outcome of - // the last death test. - static const char* LastMessage(); - - static void set_last_death_test_message(const std::string& message); - - private: - // A string containing a description of the outcome of the last death test. - static std::string last_death_test_message_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest); -}; - -// Factory interface for death tests. May be mocked out for testing. 
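As an aside, the wait(2)-style exit status described above is exactly what a user-supplied predicate for ASSERT_EXIT/EXPECT_EXIT receives. A minimal sketch, assuming a POSIX target and the usual "gtest/gtest.h" include; the predicate and test names are invented for illustration:

#include <sys/wait.h>
#include <unistd.h>
#include "gtest/gtest.h"

// Hypothetical predicate: takes the raw wait(2) exit status and accepts a
// normal exit with code 2, much like the built-in ::testing::ExitedWithCode(2).
bool ExitedWithCodeTwo(int exit_status) {
  return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == 2;
}

TEST(IllustrativeDeathTest, ExitStatusPredicate) {
  // The empty regex places no constraint on the child's stderr output.
  ASSERT_EXIT(_exit(2), ExitedWithCodeTwo, "");
}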
-class DeathTestFactory { - public: - virtual ~DeathTestFactory() { } - virtual bool Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test) = 0; -}; - -// A concrete DeathTestFactory implementation for normal use. -class DefaultDeathTestFactory : public DeathTestFactory { - public: - virtual bool Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test); -}; - -// Returns true if exit_status describes a process that was terminated -// by a signal, or exited normally with a nonzero exit code. -GTEST_API_ bool ExitedUnsuccessfully(int exit_status); - -// Traps C++ exceptions escaping statement and reports them as test -// failures. Note that trapping SEH exceptions is not implemented here. -# if GTEST_HAS_EXCEPTIONS -# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } catch (const ::std::exception& gtest_exception) { \ - fprintf(\ - stderr, \ - "\n%s: Caught std::exception-derived exception escaping the " \ - "death test statement. Exception message: %s\n", \ - ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \ - gtest_exception.what()); \ - fflush(stderr); \ - death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ - } catch (...) { \ - death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ - } - -# else -# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) - -# endif - -// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*, -// ASSERT_EXIT*, and EXPECT_EXIT*. -# define GTEST_DEATH_TEST_(statement, predicate, regex, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - const ::testing::internal::RE& gtest_regex = (regex); \ - ::testing::internal::DeathTest* gtest_dt; \ - if (!::testing::internal::DeathTest::Create(#statement, >est_regex, \ - __FILE__, __LINE__, >est_dt)) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ - } \ - if (gtest_dt != NULL) { \ - ::testing::internal::scoped_ptr< ::testing::internal::DeathTest> \ - gtest_dt_ptr(gtest_dt); \ - switch (gtest_dt->AssumeRole()) { \ - case ::testing::internal::DeathTest::OVERSEE_TEST: \ - if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ - } \ - break; \ - case ::testing::internal::DeathTest::EXECUTE_TEST: { \ - ::testing::internal::DeathTest::ReturnSentinel \ - gtest_sentinel(gtest_dt); \ - GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \ - gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \ - break; \ - } \ - default: \ - break; \ - } \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__): \ - fail(::testing::internal::DeathTest::LastMessage()) -// The symbol "fail" here expands to something into which a message -// can be streamed. - -// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in -// NDEBUG mode. In this case we need the statements to be executed, the regex is -// ignored, and the macro must accept a streamed message even though the message -// is never printed. 
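To make that last requirement concrete, here is a hedged sketch of user code that has to keep compiling under both build modes (the helper and test names are invented; the regex assumes a glibc-style assert() message):

#include <cassert>
#include "gtest/gtest.h"

int g_counter = 0;

// Illustrative helper: performs a side effect, then aborts only in debug builds.
void BumpThenMaybeDie() {
  ++g_counter;
  assert(false);
}

TEST(IllustrativeDebugDeathTest, AcceptsStreamedMessage) {
  // In an NDEBUG build the call simply runs (g_counter becomes 1) and the
  // regex is ignored; in a debug build it runs in a child process and must
  // die with stderr matching the regex. The streamed text must compile in
  // both modes even though it may never be printed.
  EXPECT_DEBUG_DEATH(BumpThenMaybeDie(), "Assertion")
      << "counter was " << g_counter;
}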
-# define GTEST_EXECUTE_STATEMENT_(statement, regex) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } else \ - ::testing::Message() - -// A class representing the parsed contents of the -// --gtest_internal_run_death_test flag, as it existed when -// RUN_ALL_TESTS was called. -class InternalRunDeathTestFlag { - public: - InternalRunDeathTestFlag(const std::string& a_file, - int a_line, - int an_index, - int a_write_fd) - : file_(a_file), line_(a_line), index_(an_index), - write_fd_(a_write_fd) {} - - ~InternalRunDeathTestFlag() { - if (write_fd_ >= 0) - posix::Close(write_fd_); - } - - const std::string& file() const { return file_; } - int line() const { return line_; } - int index() const { return index_; } - int write_fd() const { return write_fd_; } - - private: - std::string file_; - int line_; - int index_; - int write_fd_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag); -}; - -// Returns a newly created InternalRunDeathTestFlag object with fields -// initialized from the GTEST_FLAG(internal_run_death_test) flag if -// the flag is specified; otherwise returns NULL. -InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag(); - -#else // GTEST_HAS_DEATH_TEST - -// This macro is used for implementing macros such as -// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where -// death tests are not supported. Those macros must compile on such systems -// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on -// systems that support death tests. This allows one to write such a macro -// on a system that does not support death tests and be sure that it will -// compile on a death-test supporting system. -// -// Parameters: -// statement - A statement that a macro such as EXPECT_DEATH would test -// for program termination. This macro has to make sure this -// statement is compiled but not executed, to ensure that -// EXPECT_DEATH_IF_SUPPORTED compiles with a certain -// parameter iff EXPECT_DEATH compiles with it. -// regex - A regex that a macro such as EXPECT_DEATH would use to test -// the output of statement. This parameter has to be -// compiled but not evaluated by this macro, to ensure that -// this macro only accepts expressions that a macro such as -// EXPECT_DEATH would accept. -// terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED -// and a return statement for ASSERT_DEATH_IF_SUPPORTED. -// This ensures that ASSERT_DEATH_IF_SUPPORTED will not -// compile inside functions where ASSERT_DEATH doesn't -// compile. -// -// The branch that has an always false condition is used to ensure that -// statement and regex are compiled (and thus syntactically correct) but -// never executed. The unreachable code macro protects the terminator -// statement from generating an 'unreachable code' warning in case -// statement unconditionally returns or throws. The Message constructor at -// the end allows the syntax of streaming additional messages into the -// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH. 
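Downstream of this machinery, EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED are the portable entry points. A small usage sketch, with an invented function under test:

#include <cstdlib>
#include "gtest/gtest.h"

// Invented function under test.
void CrashOnNullConfig(const char* config) {
  if (config == NULL) abort();
}

TEST(IllustrativePortableDeathTest, RejectsNullConfig) {
  // On platforms with death test support this forks and checks the child;
  // elsewhere it expands to the warning-only branch described above, but the
  // statement and the regex still have to compile.
  EXPECT_DEATH_IF_SUPPORTED(CrashOnNullConfig(NULL), "");
}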
-# define GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, terminator) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - GTEST_LOG_(WARNING) \ - << "Death tests are not supported on this platform.\n" \ - << "Statement '" #statement "' cannot be verified."; \ - } else if (::testing::internal::AlwaysFalse()) { \ - ::testing::internal::RE::PartialMatch(".*", (regex)); \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - terminator; \ - } else \ - ::testing::Message() - -#endif // GTEST_HAS_DEATH_TEST - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ - -namespace testing { - -// This flag controls the style of death tests. Valid values are "threadsafe", -// meaning that the death test child process will re-execute the test binary -// from the start, running only a single death test, or "fast", -// meaning that the child process will execute the test logic immediately -// after forking. -GTEST_DECLARE_string_(death_test_style); - -#if GTEST_HAS_DEATH_TEST - -namespace internal { - -// Returns a Boolean value indicating whether the caller is currently -// executing in the context of the death test child process. Tools such as -// Valgrind heap checkers may need this to modify their behavior in death -// tests. IMPORTANT: This is an internal utility. Using it may break the -// implementation of death tests. User code MUST NOT use it. -GTEST_API_ bool InDeathTestChild(); - -} // namespace internal - -// The following macros are useful for writing death tests. - -// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is -// executed: -// -// 1. It generates a warning if there is more than one active -// thread. This is because it's safe to fork() or clone() only -// when there is a single thread. -// -// 2. The parent process clone()s a sub-process and runs the death -// test in it; the sub-process exits with code 0 at the end of the -// death test, if it hasn't exited already. -// -// 3. The parent process waits for the sub-process to terminate. -// -// 4. The parent process checks the exit code and error message of -// the sub-process. -// -// Examples: -// -// ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number"); -// for (int i = 0; i < 5; i++) { -// EXPECT_DEATH(server.ProcessRequest(i), -// "Invalid request .* in ProcessRequest()") -// << "Failed to die on request " << i; -// } -// -// ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting"); -// -// bool KilledBySIGHUP(int exit_code) { -// return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP; -// } -// -// ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!"); -// -// On the regular expressions used in death tests: -// -// On POSIX-compliant systems (*nix), we use the library, -// which uses the POSIX extended regex syntax. -// -// On other platforms (e.g. Windows), we only support a simple regex -// syntax implemented as part of Google Test. This limited -// implementation should be enough most of the time when writing -// death tests; though it lacks many features you can find in PCRE -// or POSIX extended regex syntax. For example, we don't support -// union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and -// repetition count ("x{5,7}"), among others. -// -// Below is the syntax that we do support. We chose it to be a -// subset of both PCRE and POSIX extended regex, so it's easy to -// learn wherever you come from. 
In the following: 'A' denotes a -// literal character, period (.), or a single \\ escape sequence; -// 'x' and 'y' denote regular expressions; 'm' and 'n' are for -// natural numbers. -// -// c matches any literal character c -// \\d matches any decimal digit -// \\D matches any character that's not a decimal digit -// \\f matches \f -// \\n matches \n -// \\r matches \r -// \\s matches any ASCII whitespace, including \n -// \\S matches any character that's not a whitespace -// \\t matches \t -// \\v matches \v -// \\w matches any letter, _, or decimal digit -// \\W matches any character that \\w doesn't match -// \\c matches any literal character c, which must be a punctuation -// . matches any single character except \n -// A? matches 0 or 1 occurrences of A -// A* matches 0 or many occurrences of A -// A+ matches 1 or many occurrences of A -// ^ matches the beginning of a string (not that of each line) -// $ matches the end of a string (not that of each line) -// xy matches x followed by y -// -// If you accidentally use PCRE or POSIX extended regex features -// not implemented by us, you will get a run-time failure. In that -// case, please try to rewrite your regular expression within the -// above syntax. -// -// This implementation is *not* meant to be as highly tuned or robust -// as a compiled regex library, but should perform well enough for a -// death test, which already incurs significant overhead by launching -// a child process. -// -// Known caveats: -// -// A "threadsafe" style death test obtains the path to the test -// program from argv[0] and re-executes it in the sub-process. For -// simplicity, the current implementation doesn't search the PATH -// when launching the sub-process. This means that the user must -// invoke the test program via a path that contains at least one -// path separator (e.g. path/to/foo_test and -// /absolute/path/to/bar_test are fine, but foo_test is not). This -// is rarely a problem as people usually don't put the test binary -// directory in PATH. -// -// TODO(wan@google.com): make thread-safe death tests search the PATH. - -// Asserts that a given statement causes the program to exit, with an -// integer exit status that satisfies predicate, and emitting error output -// that matches regex. -# define ASSERT_EXIT(statement, predicate, regex) \ - GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_) - -// Like ASSERT_EXIT, but continues on to successive tests in the -// test case, if any: -# define EXPECT_EXIT(statement, predicate, regex) \ - GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_) - -// Asserts that a given statement causes the program to exit, either by -// explicitly exiting with a nonzero exit code or being killed by a -// signal, and emitting error output that matches regex. -# define ASSERT_DEATH(statement, regex) \ - ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) - -// Like ASSERT_DEATH, but continues on to successive tests in the -// test case, if any: -# define EXPECT_DEATH(statement, regex) \ - EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) - -// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*: - -// Tests that an exit code describes a normal exit with a given exit code. -class GTEST_API_ ExitedWithCode { - public: - explicit ExitedWithCode(int exit_code); - bool operator()(int exit_status) const; - private: - // No implementation - assignment is unsupported. 
- void operator=(const ExitedWithCode& other); - - const int exit_code_; -}; - -# if !GTEST_OS_WINDOWS -// Tests that an exit code describes an exit due to termination by a -// given signal. -class GTEST_API_ KilledBySignal { - public: - explicit KilledBySignal(int signum); - bool operator()(int exit_status) const; - private: - const int signum_; -}; -# endif // !GTEST_OS_WINDOWS - -// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode. -// The death testing framework causes this to have interesting semantics, -// since the sideeffects of the call are only visible in opt mode, and not -// in debug mode. -// -// In practice, this can be used to test functions that utilize the -// LOG(DFATAL) macro using the following style: -// -// int DieInDebugOr12(int* sideeffect) { -// if (sideeffect) { -// *sideeffect = 12; -// } -// LOG(DFATAL) << "death"; -// return 12; -// } -// -// TEST(TestCase, TestDieOr12WorksInDgbAndOpt) { -// int sideeffect = 0; -// // Only asserts in dbg. -// EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death"); -// -// #ifdef NDEBUG -// // opt-mode has sideeffect visible. -// EXPECT_EQ(12, sideeffect); -// #else -// // dbg-mode no visible sideeffect. -// EXPECT_EQ(0, sideeffect); -// #endif -// } -// -// This will assert that DieInDebugReturn12InOpt() crashes in debug -// mode, usually due to a DCHECK or LOG(DFATAL), but returns the -// appropriate fallback value (12 in this case) in opt mode. If you -// need to test that a function has appropriate side-effects in opt -// mode, include assertions against the side-effects. A general -// pattern for this is: -// -// EXPECT_DEBUG_DEATH({ -// // Side-effects here will have an effect after this statement in -// // opt mode, but none in debug mode. -// EXPECT_EQ(12, DieInDebugOr12(&sideeffect)); -// }, "death"); -// -# ifdef NDEBUG - -# define EXPECT_DEBUG_DEATH(statement, regex) \ - GTEST_EXECUTE_STATEMENT_(statement, regex) - -# define ASSERT_DEBUG_DEATH(statement, regex) \ - GTEST_EXECUTE_STATEMENT_(statement, regex) - -# else - -# define EXPECT_DEBUG_DEATH(statement, regex) \ - EXPECT_DEATH(statement, regex) - -# define ASSERT_DEBUG_DEATH(statement, regex) \ - ASSERT_DEATH(statement, regex) - -# endif // NDEBUG for EXPECT_DEBUG_DEATH -#endif // GTEST_HAS_DEATH_TEST - -// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and -// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if -// death tests are supported; otherwise they just issue a warning. This is -// useful when you are combining death test assertions with normal test -// assertions in one test. -#if GTEST_HAS_DEATH_TEST -# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ - EXPECT_DEATH(statement, regex) -# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ - ASSERT_DEATH(statement, regex) -#else -# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ - GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, ) -# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ - GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return) -#endif - -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ -// This file was GENERATED by command: -// pump.py gtest-param-test.h.pump -// DO NOT EDIT BY HAND!!! - -// Copyright 2008, Google Inc. -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: vladl@google.com (Vlad Losev) -// -// Macros and functions for implementing parameterized tests -// in Google C++ Testing Framework (Google Test) -// -// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// -#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ - - -// Value-parameterized tests allow you to test your code with different -// parameters without writing multiple copies of the same test. -// -// Here is how you use value-parameterized tests: - -#if 0 - -// To write value-parameterized tests, first you should define a fixture -// class. It is usually derived from testing::TestWithParam (see below for -// another inheritance scheme that's sometimes useful in more complicated -// class hierarchies), where the type of your parameter values. -// TestWithParam is itself derived from testing::Test. T can be any -// copyable type. If it's a raw pointer, you are responsible for managing the -// lifespan of the pointed values. - -class FooTest : public ::testing::TestWithParam { - // You can implement all the usual class fixture members here. -}; - -// Then, use the TEST_P macro to define as many parameterized tests -// for this fixture as you want. The _P suffix is for "parameterized" -// or "pattern", whichever you prefer to think. - -TEST_P(FooTest, DoesBlah) { - // Inside a test, access the test parameter with the GetParam() method - // of the TestWithParam class: - EXPECT_TRUE(foo.Blah(GetParam())); - ... -} - -TEST_P(FooTest, HasBlahBlah) { - ... -} - -// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test -// case with any set of parameters you want. Google Test defines a number -// of functions for generating test parameters. They return what we call -// (surprise!) parameter generators. Here is a summary of them, which -// are all in the testing namespace: -// -// -// Range(begin, end [, step]) - Yields values {begin, begin+step, -// begin+step+step, ...}. The values do not -// include end. 
step defaults to 1. -// Values(v1, v2, ..., vN) - Yields values {v1, v2, ..., vN}. -// ValuesIn(container) - Yields values from a C-style array, an STL -// ValuesIn(begin,end) container, or an iterator range [begin, end). -// Bool() - Yields sequence {false, true}. -// Combine(g1, g2, ..., gN) - Yields all combinations (the Cartesian product -// for the math savvy) of the values generated -// by the N generators. -// -// For more details, see comments at the definitions of these functions below -// in this file. -// -// The following statement will instantiate tests from the FooTest test case -// each with parameter values "meeny", "miny", and "moe". - -INSTANTIATE_TEST_CASE_P(InstantiationName, - FooTest, - Values("meeny", "miny", "moe")); - -// To distinguish different instances of the pattern, (yes, you -// can instantiate it more then once) the first argument to the -// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the -// actual test case name. Remember to pick unique prefixes for different -// instantiations. The tests from the instantiation above will have -// these names: -// -// * InstantiationName/FooTest.DoesBlah/0 for "meeny" -// * InstantiationName/FooTest.DoesBlah/1 for "miny" -// * InstantiationName/FooTest.DoesBlah/2 for "moe" -// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny" -// * InstantiationName/FooTest.HasBlahBlah/1 for "miny" -// * InstantiationName/FooTest.HasBlahBlah/2 for "moe" -// -// You can use these names in --gtest_filter. -// -// This statement will instantiate all tests from FooTest again, each -// with parameter values "cat" and "dog": - -const char* pets[] = {"cat", "dog"}; -INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets)); - -// The tests from the instantiation above will have these names: -// -// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat" -// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog" -// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat" -// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog" -// -// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests -// in the given test case, whether their definitions come before or -// AFTER the INSTANTIATE_TEST_CASE_P statement. -// -// Please also note that generator expressions (including parameters to the -// generators) are evaluated in InitGoogleTest(), after main() has started. -// This allows the user on one hand, to adjust generator parameters in order -// to dynamically determine a set of tests to run and on the other hand, -// give the user a chance to inspect the generated tests with Google Test -// reflection API before RUN_ALL_TESTS() is executed. -// -// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc -// for more examples. -// -// In the future, we plan to publish the API for defining new parameter -// generators. But for now this interface remains part of the internal -// implementation and is subject to change. -// -// -// A parameterized test fixture must be derived from testing::Test and from -// testing::WithParamInterface, where T is the type of the parameter -// values. Inheriting from TestWithParam satisfies that requirement because -// TestWithParam inherits from both Test and WithParamInterface. In more -// complicated hierarchies, however, it is occasionally useful to inherit -// separately from Test and WithParamInterface. 
For example: - -class BaseTest : public ::testing::Test { - // You can inherit all the usual members for a non-parameterized test - // fixture here. -}; - -class DerivedTest : public BaseTest, public ::testing::WithParamInterface { - // The usual test fixture members go here too. -}; - -TEST_F(BaseTest, HasFoo) { - // This is an ordinary non-parameterized test. -} - -TEST_P(DerivedTest, DoesBlah) { - // GetParam works just the same here as if you inherit from TestWithParam. - EXPECT_TRUE(foo.Blah(GetParam())); -} - -#endif // 0 - - -#if !GTEST_OS_SYMBIAN -# include -#endif - -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: vladl@google.com (Vlad Losev) - -// Type and function utilities for implementing parameterized tests. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ - -#include -#include -#include - -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. -// Copyright 2003 Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: Dan Egnor (egnor@google.com) -// -// A "smart" pointer type with reference tracking. Every pointer to a -// particular object is kept on a circular linked list. When the last pointer -// to an object is destroyed or reassigned, the object is deleted. -// -// Used properly, this deletes the object when the last reference goes away. -// There are several caveats: -// - Like all reference counting schemes, cycles lead to leaks. -// - Each smart pointer is actually two pointers (8 bytes instead of 4). -// - Every time a pointer is assigned, the entire list of pointers to that -// object is traversed. This class is therefore NOT SUITABLE when there -// will often be more than two or three pointers to a particular object. -// - References are only tracked as long as linked_ptr<> objects are copied. -// If a linked_ptr<> is converted to a raw pointer and back, BAD THINGS -// will happen (double deletion). -// -// A good use of this class is storing object references in STL containers. -// You can safely put linked_ptr<> in a vector<>. -// Other uses may not be as good. -// -// Note: If you use an incomplete type with linked_ptr<>, the class -// *containing* linked_ptr<> must have a constructor and destructor (even -// if they do nothing!). -// -// Bill Gibbons suggested we use something like this. -// -// Thread Safety: -// Unlike other linked_ptr implementations, in this implementation -// a linked_ptr object is thread-safe in the sense that: -// - it's safe to copy linked_ptr objects concurrently, -// - it's safe to copy *from* a linked_ptr and read its underlying -// raw pointer (e.g. via get()) concurrently, and -// - it's safe to write to two linked_ptrs that point to the same -// shared object concurrently. -// TODO(wan@google.com): rename this to safe_linked_ptr to avoid -// confusion with normal linked_ptr. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ - -#include -#include - - -namespace testing { -namespace internal { - -// Protects copying of all linked_ptr objects. -GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_linked_ptr_mutex); - -// This is used internally by all instances of linked_ptr<>. It needs to be -// a non-template class because different types of linked_ptr<> can refer to -// the same object (linked_ptr(obj) vs linked_ptr(obj)). -// So, it needs to be possible for different types of linked_ptr to participate -// in the same circular linked list, so we need a single class type here. -// -// DO NOT USE THIS CLASS DIRECTLY YOURSELF. Use linked_ptr. -class linked_ptr_internal { - public: - // Create a new circle that includes only this instance. 
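A hedged sketch of the container use case suggested above (Widget is invented, the include path assumes a non-fused gtest checkout, and linked_ptr is gtest-internal API rather than something user code should normally reach for):

#include <vector>
#include "gtest/internal/gtest-linked_ptr.h"

struct Widget {
  explicit Widget(int id) : id(id) {}
  int id;
};

void Demo() {
  // Copying grows the circular reference list; the Widget is deleted when the
  // last linked_ptr that refers to it departs the circle.
  std::vector<testing::internal::linked_ptr<Widget> > widgets;
  widgets.push_back(testing::internal::make_linked_ptr(new Widget(1)));
  std::vector<testing::internal::linked_ptr<Widget> > copies = widgets;
  (void)copies;
}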
- void join_new() { - next_ = this; - } - - // Many linked_ptr operations may change p.link_ for some linked_ptr - // variable p in the same circle as this object. Therefore we need - // to prevent two such operations from occurring concurrently. - // - // Note that different types of linked_ptr objects can coexist in a - // circle (e.g. linked_ptr, linked_ptr, and - // linked_ptr). Therefore we must use a single mutex to - // protect all linked_ptr objects. This can create serious - // contention in production code, but is acceptable in a testing - // framework. - - // Join an existing circle. - void join(linked_ptr_internal const* ptr) - GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) { - MutexLock lock(&g_linked_ptr_mutex); - - linked_ptr_internal const* p = ptr; - while (p->next_ != ptr) p = p->next_; - p->next_ = this; - next_ = ptr; - } - - // Leave whatever circle we're part of. Returns true if we were the - // last member of the circle. Once this is done, you can join() another. - bool depart() - GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) { - MutexLock lock(&g_linked_ptr_mutex); - - if (next_ == this) return true; - linked_ptr_internal const* p = next_; - while (p->next_ != this) p = p->next_; - p->next_ = next_; - return false; - } - - private: - mutable linked_ptr_internal const* next_; -}; - -template -class linked_ptr { - public: - typedef T element_type; - - // Take over ownership of a raw pointer. This should happen as soon as - // possible after the object is created. - explicit linked_ptr(T* ptr = NULL) { capture(ptr); } - ~linked_ptr() { depart(); } - - // Copy an existing linked_ptr<>, adding ourselves to the list of references. - template linked_ptr(linked_ptr const& ptr) { copy(&ptr); } - linked_ptr(linked_ptr const& ptr) { // NOLINT - assert(&ptr != this); - copy(&ptr); - } - - // Assignment releases the old value and acquires the new. - template linked_ptr& operator=(linked_ptr const& ptr) { - depart(); - copy(&ptr); - return *this; - } - - linked_ptr& operator=(linked_ptr const& ptr) { - if (&ptr != this) { - depart(); - copy(&ptr); - } - return *this; - } - - // Smart pointer members. - void reset(T* ptr = NULL) { - depart(); - capture(ptr); - } - T* get() const { return value_; } - T* operator->() const { return value_; } - T& operator*() const { return *value_; } - - bool operator==(T* p) const { return value_ == p; } - bool operator!=(T* p) const { return value_ != p; } - template - bool operator==(linked_ptr const& ptr) const { - return value_ == ptr.get(); - } - template - bool operator!=(linked_ptr const& ptr) const { - return value_ != ptr.get(); - } - - private: - template - friend class linked_ptr; - - T* value_; - linked_ptr_internal link_; - - void depart() { - if (link_.depart()) delete value_; - } - - void capture(T* ptr) { - value_ = ptr; - link_.join_new(); - } - - template void copy(linked_ptr const* ptr) { - value_ = ptr->get(); - if (value_) - link_.join(&ptr->link_); - else - link_.join_new(); - } -}; - -template inline -bool operator==(T* ptr, const linked_ptr& x) { - return ptr == x.get(); -} - -template inline -bool operator!=(T* ptr, const linked_ptr& x) { - return ptr != x.get(); -} - -// A function to convert T* into linked_ptr -// Doing e.g. 
make_linked_ptr(new FooBarBaz(arg)) is a shorter notation -// for linked_ptr >(new FooBarBaz(arg)) -template -linked_ptr make_linked_ptr(T* ptr) { - return linked_ptr(ptr); -} - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -// Google Test - The Google C++ Testing Framework -// -// This file implements a universal value printer that can print a -// value of any type T: -// -// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); -// -// A user can teach this function how to print a class type T by -// defining either operator<<() or PrintTo() in the namespace that -// defines T. More specifically, the FIRST defined function in the -// following list will be used (assuming T is defined in namespace -// foo): -// -// 1. foo::PrintTo(const T&, ostream*) -// 2. operator<<(ostream&, const T&) defined in either foo or the -// global namespace. -// -// If none of the above is defined, it will print the debug string of -// the value if it is a protocol buffer, or print the raw bytes in the -// value otherwise. -// -// To aid debugging: when T is a reference type, the address of the -// value is also printed; when T is a (const) char pointer, both the -// pointer value and the NUL-terminated string it points to are -// printed. -// -// We also provide some convenient wrappers: -// -// // Prints a value to a string. For a (const or not) char -// // pointer, the NUL-terminated string (but not the pointer) is -// // printed. -// std::string ::testing::PrintToString(const T& value); -// -// // Prints a value tersely: for a reference type, the referenced -// // value (but not the address) is printed; for a (const or not) char -// // pointer, the NUL-terminated string (but not the pointer) is -// // printed. 
-// void ::testing::internal::UniversalTersePrint(const T& value, ostream*); -// -// // Prints value using the type inferred by the compiler. The difference -// // from UniversalTersePrint() is that this function prints both the -// // pointer and the NUL-terminated string for a (const or not) char pointer. -// void ::testing::internal::UniversalPrint(const T& value, ostream*); -// -// // Prints the fields of a tuple tersely to a string vector, one -// // element for each field. Tuple support must be enabled in -// // gtest-port.h. -// std::vector UniversalTersePrintTupleFieldsToStrings( -// const Tuple& value); -// -// Known limitation: -// -// The print primitives print the elements of an STL-style container -// using the compiler-inferred type of *iter where iter is a -// const_iterator of the container. When const_iterator is an input -// iterator but not a forward iterator, this inferred type may not -// match value_type, and the print output may be incorrect. In -// practice, this is rarely a problem as for most containers -// const_iterator is a forward iterator. We'll fix this if there's an -// actual need for it. Note that this fix cannot rely on value_type -// being defined as many user-defined container types don't have -// value_type. - -#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ - -#include // NOLINT -#include -#include -#include -#include - -namespace testing { - -// Definitions in the 'internal' and 'internal2' name spaces are -// subject to change without notice. DO NOT USE THEM IN USER CODE! -namespace internal2 { - -// Prints the given number of bytes in the given object to the given -// ostream. -GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes, - size_t count, - ::std::ostream* os); - -// For selecting which printer to use when a given type has neither << -// nor PrintTo(). -enum TypeKind { - kProtobuf, // a protobuf type - kConvertibleToInteger, // a type implicitly convertible to BiggestInt - // (e.g. a named or unnamed enum type) - kOtherType // anything else -}; - -// TypeWithoutFormatter::PrintValue(value, os) is called -// by the universal printer to print a value of type T when neither -// operator<< nor PrintTo() is defined for T, where kTypeKind is the -// "kind" of T as defined by enum TypeKind. -template -class TypeWithoutFormatter { - public: - // This default version is called when kTypeKind is kOtherType. - static void PrintValue(const T& value, ::std::ostream* os) { - PrintBytesInObjectTo(reinterpret_cast(&value), - sizeof(value), os); - } -}; - -// We print a protobuf using its ShortDebugString() when the string -// doesn't exceed this many characters; otherwise we print it using -// DebugString() for better readability. -const size_t kProtobufOneLinerMaxLength = 50; - -template -class TypeWithoutFormatter { - public: - static void PrintValue(const T& value, ::std::ostream* os) { - const ::testing::internal::string short_str = value.ShortDebugString(); - const ::testing::internal::string pretty_str = - short_str.length() <= kProtobufOneLinerMaxLength ? - short_str : ("\n" + value.DebugString()); - *os << ("<" + pretty_str + ">"); - } -}; - -template -class TypeWithoutFormatter { - public: - // Since T has no << operator or PrintTo() but can be implicitly - // converted to BiggestInt, we print it as a BiggestInt. - // - // Most likely T is an enum type (either named or unnamed), in which - // case printing it as an integer is the desired behavior. 
In case - // T is not an enum, printing it as an integer is the best we can do - // given that it has no user-defined printer. - static void PrintValue(const T& value, ::std::ostream* os) { - const internal::BiggestInt kBigInt = value; - *os << kBigInt; - } -}; - -// Prints the given value to the given ostream. If the value is a -// protocol message, its debug string is printed; if it's an enum or -// of a type implicitly convertible to BiggestInt, it's printed as an -// integer; otherwise the bytes in the value are printed. This is -// what UniversalPrinter::Print() does when it knows nothing about -// type T and T has neither << operator nor PrintTo(). -// -// A user can override this behavior for a class type Foo by defining -// a << operator in the namespace where Foo is defined. -// -// We put this operator in namespace 'internal2' instead of 'internal' -// to simplify the implementation, as much code in 'internal' needs to -// use << in STL, which would conflict with our own << were it defined -// in 'internal'. -// -// Note that this operator<< takes a generic std::basic_ostream type instead of the more restricted std::ostream. If -// we define it to take an std::ostream instead, we'll get an -// "ambiguous overloads" compiler error when trying to print a type -// Foo that supports streaming to std::basic_ostream, as the compiler cannot tell whether -// operator<<(std::ostream&, const T&) or -// operator<<(std::basic_stream, const Foo&) is more -// specific. -template -::std::basic_ostream& operator<<( - ::std::basic_ostream& os, const T& x) { - TypeWithoutFormatter::value ? kProtobuf : - internal::ImplicitlyConvertible::value ? - kConvertibleToInteger : kOtherType)>::PrintValue(x, &os); - return os; -} - -} // namespace internal2 -} // namespace testing - -// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up -// magic needed for implementing UniversalPrinter won't work. -namespace testing_internal { - -// Used to print a value that is not an STL-style container when the -// user doesn't define PrintTo() for it. -template -void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) { - // With the following statement, during unqualified name lookup, - // testing::internal2::operator<< appears as if it was declared in - // the nearest enclosing namespace that contains both - // ::testing_internal and ::testing::internal2, i.e. the global - // namespace. For more details, refer to the C++ Standard section - // 7.3.4-1 [namespace.udir]. This allows us to fall back onto - // testing::internal2::operator<< in case T doesn't come with a << - // operator. - // - // We cannot write 'using ::testing::internal2::operator<<;', which - // gcc 3.3 fails to compile due to a compiler bug. - using namespace ::testing::internal2; // NOLINT - - // Assuming T is defined in namespace foo, in the next statement, - // the compiler will consider all of: - // - // 1. foo::operator<< (thanks to Koenig look-up), - // 2. ::operator<< (as the current namespace is enclosed in ::), - // 3. testing::internal2::operator<< (thanks to the using statement above). - // - // The operator<< whose type matches T best will be picked. - // - // We deliberately allow #2 to be a candidate, as sometimes it's - // impossible to define #1 (e.g. when foo is ::std, defining - // anything in it is undefined behavior unless you are a compiler - // vendor.). 
- *os << value; -} - -} // namespace testing_internal - -namespace testing { -namespace internal { - -// UniversalPrinter::Print(value, ostream_ptr) prints the given -// value to the given ostream. The caller must ensure that -// 'ostream_ptr' is not NULL, or the behavior is undefined. -// -// We define UniversalPrinter as a class template (as opposed to a -// function template), as we need to partially specialize it for -// reference types, which cannot be done with function templates. -template -class UniversalPrinter; - -template -void UniversalPrint(const T& value, ::std::ostream* os); - -// Used to print an STL-style container when the user doesn't define -// a PrintTo() for it. -template -void DefaultPrintTo(IsContainer /* dummy */, - false_type /* is not a pointer */, - const C& container, ::std::ostream* os) { - const size_t kMaxCount = 32; // The maximum number of elements to print. - *os << '{'; - size_t count = 0; - for (typename C::const_iterator it = container.begin(); - it != container.end(); ++it, ++count) { - if (count > 0) { - *os << ','; - if (count == kMaxCount) { // Enough has been printed. - *os << " ..."; - break; - } - } - *os << ' '; - // We cannot call PrintTo(*it, os) here as PrintTo() doesn't - // handle *it being a native array. - internal::UniversalPrint(*it, os); - } - - if (count > 0) { - *os << ' '; - } - *os << '}'; -} - -// Used to print a pointer that is neither a char pointer nor a member -// pointer, when the user doesn't define PrintTo() for it. (A member -// variable pointer or member function pointer doesn't really point to -// a location in the address space. Their representation is -// implementation-defined. Therefore they will be printed as raw -// bytes.) -template -void DefaultPrintTo(IsNotContainer /* dummy */, - true_type /* is a pointer */, - T* p, ::std::ostream* os) { - if (p == NULL) { - *os << "NULL"; - } else { - // C++ doesn't allow casting from a function pointer to any object - // pointer. - // - // IsTrue() silences warnings: "Condition is always true", - // "unreachable code". - if (IsTrue(ImplicitlyConvertible::value)) { - // T is not a function type. We just call << to print p, - // relying on ADL to pick up user-defined << for their pointer - // types, if any. - *os << p; - } else { - // T is a function type, so '*os << p' doesn't do what we want - // (it just prints p as bool). We want to print p as a const - // void*. However, we cannot cast it to const void* directly, - // even using reinterpret_cast, as earlier versions of gcc - // (e.g. 3.4.5) cannot compile the cast when p is a function - // pointer. Casting to UInt64 first solves the problem. - *os << reinterpret_cast( - reinterpret_cast(p)); - } - } -} - -// Used to print a non-container, non-pointer value when the user -// doesn't define PrintTo() for it. -template -void DefaultPrintTo(IsNotContainer /* dummy */, - false_type /* is not a pointer */, - const T& value, ::std::ostream* os) { - ::testing_internal::DefaultPrintNonContainerTo(value, os); -} - -// Prints the given value using the << operator if it has one; -// otherwise prints the bytes in it. This is what -// UniversalPrinter::Print() does when PrintTo() is not specialized -// or overloaded for type T. -// -// A user can override this behavior for a class type Foo by defining -// an overload of PrintTo() in the namespace where Foo is defined. We -// give the user this option as sometimes defining a << operator for -// Foo is not desirable (e.g. 
the coding style may prevent doing it, -// or there is already a << operator but it doesn't do what the user -// wants). -template -void PrintTo(const T& value, ::std::ostream* os) { - // DefaultPrintTo() is overloaded. The type of its first two - // arguments determine which version will be picked. If T is an - // STL-style container, the version for container will be called; if - // T is a pointer, the pointer version will be called; otherwise the - // generic version will be called. - // - // Note that we check for container types here, prior to we check - // for protocol message types in our operator<<. The rationale is: - // - // For protocol messages, we want to give people a chance to - // override Google Mock's format by defining a PrintTo() or - // operator<<. For STL containers, other formats can be - // incompatible with Google Mock's format for the container - // elements; therefore we check for container types here to ensure - // that our format is used. - // - // The second argument of DefaultPrintTo() is needed to bypass a bug - // in Symbian's C++ compiler that prevents it from picking the right - // overload between: - // - // PrintTo(const T& x, ...); - // PrintTo(T* x, ...); - DefaultPrintTo(IsContainerTest(0), is_pointer(), value, os); -} - -// The following list of PrintTo() overloads tells -// UniversalPrinter::Print() how to print standard types (built-in -// types, strings, plain arrays, and pointers). - -// Overloads for various char types. -GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os); -GTEST_API_ void PrintTo(signed char c, ::std::ostream* os); -inline void PrintTo(char c, ::std::ostream* os) { - // When printing a plain char, we always treat it as unsigned. This - // way, the output won't be affected by whether the compiler thinks - // char is signed or not. - PrintTo(static_cast(c), os); -} - -// Overloads for other simple built-in types. -inline void PrintTo(bool x, ::std::ostream* os) { - *os << (x ? "true" : "false"); -} - -// Overload for wchar_t type. -// Prints a wchar_t as a symbol if it is printable or as its internal -// code otherwise and also as its decimal code (except for L'\0'). -// The L'\0' char is printed as "L'\\0'". The decimal code is printed -// as signed integer when wchar_t is implemented by the compiler -// as a signed type and is printed as an unsigned integer when wchar_t -// is implemented as an unsigned type. -GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os); - -// Overloads for C strings. -GTEST_API_ void PrintTo(const char* s, ::std::ostream* os); -inline void PrintTo(char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} - -// signed/unsigned char is often used for representing binary data, so -// we print pointers to it as void* to be safe. -inline void PrintTo(const signed char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} -inline void PrintTo(signed char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} -inline void PrintTo(const unsigned char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} -inline void PrintTo(unsigned char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} - -// MSVC can be configured to define wchar_t as a typedef of unsigned -// short. It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native -// type. When wchar_t is a typedef, defining an overload for const -// wchar_t* would cause unsigned short* be printed as a wide string, -// possibly causing invalid memory accesses. 
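A hedged sketch of that customization point (the foo::Bar type and its fields are invented; ::testing::PrintToString() is the public helper that funnels through this printer):

#include <ostream>
#include "gtest/gtest.h"

namespace foo {

struct Bar {
  int x;
  int y;
};

// Picked up via argument-dependent lookup because it lives in Bar's namespace,
// so it takes precedence over the fallback byte dump.
void PrintTo(const Bar& bar, ::std::ostream* os) {
  *os << "Bar{" << bar.x << ", " << bar.y << "}";
}

}  // namespace foo

TEST(IllustrativePrinterTest, UsesCustomPrintTo) {
  foo::Bar bar = {1, 2};
  EXPECT_EQ("Bar{1, 2}", ::testing::PrintToString(bar));
}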
-#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) -// Overloads for wide C strings -GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os); -inline void PrintTo(wchar_t* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} -#endif - -// Overload for C arrays. Multi-dimensional arrays are printed -// properly. - -// Prints the given number of elements in an array, without printing -// the curly braces. -template -void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) { - UniversalPrint(a[0], os); - for (size_t i = 1; i != count; i++) { - *os << ", "; - UniversalPrint(a[i], os); - } -} - -// Overloads for ::string and ::std::string. -#if GTEST_HAS_GLOBAL_STRING -GTEST_API_ void PrintStringTo(const ::string&s, ::std::ostream* os); -inline void PrintTo(const ::string& s, ::std::ostream* os) { - PrintStringTo(s, os); -} -#endif // GTEST_HAS_GLOBAL_STRING - -GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os); -inline void PrintTo(const ::std::string& s, ::std::ostream* os) { - PrintStringTo(s, os); -} - -// Overloads for ::wstring and ::std::wstring. -#if GTEST_HAS_GLOBAL_WSTRING -GTEST_API_ void PrintWideStringTo(const ::wstring&s, ::std::ostream* os); -inline void PrintTo(const ::wstring& s, ::std::ostream* os) { - PrintWideStringTo(s, os); -} -#endif // GTEST_HAS_GLOBAL_WSTRING - -#if GTEST_HAS_STD_WSTRING -GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os); -inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) { - PrintWideStringTo(s, os); -} -#endif // GTEST_HAS_STD_WSTRING - -#if GTEST_HAS_TR1_TUPLE -// Overload for ::std::tr1::tuple. Needed for printing function arguments, -// which are packed as tuples. - -// Helper function for printing a tuple. T must be instantiated with -// a tuple type. -template -void PrintTupleTo(const T& t, ::std::ostream* os); - -// Overloaded PrintTo() for tuples of various arities. We support -// tuples of up-to 10 fields. The following implementation works -// regardless of whether tr1::tuple is implemented using the -// non-standard variadic template feature or not. - -inline void PrintTo(const ::std::tr1::tuple<>& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo( - const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} -#endif // GTEST_HAS_TR1_TUPLE - -// Overload for std::pair. -template -void PrintTo(const ::std::pair& value, ::std::ostream* os) { - *os << '('; - // We cannot use UniversalPrint(value.first, os) here, as T1 may be - // a reference type. The same for printing value.second. 
- UniversalPrinter::Print(value.first, os); - *os << ", "; - UniversalPrinter::Print(value.second, os); - *os << ')'; -} - -// Implements printing a non-reference type T by letting the compiler -// pick the right overload of PrintTo() for T. -template -class UniversalPrinter { - public: - // MSVC warns about adding const to a function type, so we want to - // disable the warning. -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4180) // Temporarily disables warning 4180. -#endif // _MSC_VER - - // Note: we deliberately don't call this PrintTo(), as that name - // conflicts with ::testing::internal::PrintTo in the body of the - // function. - static void Print(const T& value, ::std::ostream* os) { - // By default, ::testing::internal::PrintTo() is used for printing - // the value. - // - // Thanks to Koenig look-up, if T is a class and has its own - // PrintTo() function defined in its namespace, that function will - // be visible here. Since it is more specific than the generic ones - // in ::testing::internal, it will be picked by the compiler in the - // following statement - exactly what we want. - PrintTo(value, os); - } - -#ifdef _MSC_VER -# pragma warning(pop) // Restores the warning state. -#endif // _MSC_VER -}; - -// UniversalPrintArray(begin, len, os) prints an array of 'len' -// elements, starting at address 'begin'. -template -void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) { - if (len == 0) { - *os << "{}"; - } else { - *os << "{ "; - const size_t kThreshold = 18; - const size_t kChunkSize = 8; - // If the array has more than kThreshold elements, we'll have to - // omit some details by printing only the first and the last - // kChunkSize elements. - // TODO(wan@google.com): let the user control the threshold using a flag. - if (len <= kThreshold) { - PrintRawArrayTo(begin, len, os); - } else { - PrintRawArrayTo(begin, kChunkSize, os); - *os << ", ..., "; - PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os); - } - *os << " }"; - } -} -// This overload prints a (const) char array compactly. -GTEST_API_ void UniversalPrintArray( - const char* begin, size_t len, ::std::ostream* os); - -// This overload prints a (const) wchar_t array compactly. -GTEST_API_ void UniversalPrintArray( - const wchar_t* begin, size_t len, ::std::ostream* os); - -// Implements printing an array type T[N]. -template -class UniversalPrinter { - public: - // Prints the given array, omitting some elements when there are too - // many. - static void Print(const T (&a)[N], ::std::ostream* os) { - UniversalPrintArray(a, N, os); - } -}; - -// Implements printing a reference type T&. -template -class UniversalPrinter { - public: - // MSVC warns about adding const to a function type, so we want to - // disable the warning. -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4180) // Temporarily disables warning 4180. -#endif // _MSC_VER - - static void Print(const T& value, ::std::ostream* os) { - // Prints the address of the value. We use reinterpret_cast here - // as static_cast doesn't compile when T is a function type. - *os << "@" << reinterpret_cast(&value) << " "; - - // Then prints the value itself. - UniversalPrint(value, os); - } - -#ifdef _MSC_VER -# pragma warning(pop) // Restores the warning state. 
-#endif // _MSC_VER -}; - -// Prints a value tersely: for a reference type, the referenced value -// (but not the address) is printed; for a (const) char pointer, the -// NUL-terminated string (but not the pointer) is printed. - -template -class UniversalTersePrinter { - public: - static void Print(const T& value, ::std::ostream* os) { - UniversalPrint(value, os); - } -}; -template -class UniversalTersePrinter { - public: - static void Print(const T& value, ::std::ostream* os) { - UniversalPrint(value, os); - } -}; -template -class UniversalTersePrinter { - public: - static void Print(const T (&value)[N], ::std::ostream* os) { - UniversalPrinter::Print(value, os); - } -}; -template <> -class UniversalTersePrinter { - public: - static void Print(const char* str, ::std::ostream* os) { - if (str == NULL) { - *os << "NULL"; - } else { - UniversalPrint(string(str), os); - } - } -}; -template <> -class UniversalTersePrinter { - public: - static void Print(char* str, ::std::ostream* os) { - UniversalTersePrinter::Print(str, os); - } -}; - -#if GTEST_HAS_STD_WSTRING -template <> -class UniversalTersePrinter { - public: - static void Print(const wchar_t* str, ::std::ostream* os) { - if (str == NULL) { - *os << "NULL"; - } else { - UniversalPrint(::std::wstring(str), os); - } - } -}; -#endif - -template <> -class UniversalTersePrinter { - public: - static void Print(wchar_t* str, ::std::ostream* os) { - UniversalTersePrinter::Print(str, os); - } -}; - -template -void UniversalTersePrint(const T& value, ::std::ostream* os) { - UniversalTersePrinter::Print(value, os); -} - -// Prints a value using the type inferred by the compiler. The -// difference between this and UniversalTersePrint() is that for a -// (const) char pointer, this prints both the pointer and the -// NUL-terminated string. -template -void UniversalPrint(const T& value, ::std::ostream* os) { - // A workarond for the bug in VC++ 7.1 that prevents us from instantiating - // UniversalPrinter with T directly. - typedef T T1; - UniversalPrinter::Print(value, os); -} - -#if GTEST_HAS_TR1_TUPLE -typedef ::std::vector Strings; - -// This helper template allows PrintTo() for tuples and -// UniversalTersePrintTupleFieldsToStrings() to be defined by -// induction on the number of tuple fields. The idea is that -// TuplePrefixPrinter::PrintPrefixTo(t, os) prints the first N -// fields in tuple t, and can be defined in terms of -// TuplePrefixPrinter. - -// The inductive case. -template -struct TuplePrefixPrinter { - // Prints the first N fields of a tuple. - template - static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) { - TuplePrefixPrinter::PrintPrefixTo(t, os); - *os << ", "; - UniversalPrinter::type> - ::Print(::std::tr1::get(t), os); - } - - // Tersely prints the first N fields of a tuple to a string vector, - // one element for each field. - template - static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) { - TuplePrefixPrinter::TersePrintPrefixToStrings(t, strings); - ::std::stringstream ss; - UniversalTersePrint(::std::tr1::get(t), &ss); - strings->push_back(ss.str()); - } -}; - -// Base cases. 
-template <> -struct TuplePrefixPrinter<0> { - template - static void PrintPrefixTo(const Tuple&, ::std::ostream*) {} - - template - static void TersePrintPrefixToStrings(const Tuple&, Strings*) {} -}; -// We have to specialize the entire TuplePrefixPrinter<> class -// template here, even though the definition of -// TersePrintPrefixToStrings() is the same as the generic version, as -// Embarcadero (formerly CodeGear, formerly Borland) C++ doesn't -// support specializing a method template of a class template. -template <> -struct TuplePrefixPrinter<1> { - template - static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) { - UniversalPrinter::type>:: - Print(::std::tr1::get<0>(t), os); - } - - template - static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) { - ::std::stringstream ss; - UniversalTersePrint(::std::tr1::get<0>(t), &ss); - strings->push_back(ss.str()); - } -}; - -// Helper function for printing a tuple. T must be instantiated with -// a tuple type. -template -void PrintTupleTo(const T& t, ::std::ostream* os) { - *os << "("; - TuplePrefixPrinter< ::std::tr1::tuple_size::value>:: - PrintPrefixTo(t, os); - *os << ")"; -} - -// Prints the fields of a tuple tersely to a string vector, one -// element for each field. See the comment before -// UniversalTersePrint() for how we define "tersely". -template -Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) { - Strings result; - TuplePrefixPrinter< ::std::tr1::tuple_size::value>:: - TersePrintPrefixToStrings(value, &result); - return result; -} -#endif // GTEST_HAS_TR1_TUPLE - -} // namespace internal - -template -::std::string PrintToString(const T& value) { - ::std::stringstream ss; - internal::UniversalTersePrinter::Print(value, &ss); - return ss.str(); -} - -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ - -#if GTEST_HAS_PARAM_TEST - -namespace testing { -namespace internal { - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Outputs a message explaining invalid registration of different -// fixture class for the same test case. This may happen when -// TEST_P macro is used to define two tests with the same name -// but in different namespaces. -GTEST_API_ void ReportInvalidTestCaseType(const char* test_case_name, - const char* file, int line); - -template class ParamGeneratorInterface; -template class ParamGenerator; - -// Interface for iterating over elements provided by an implementation -// of ParamGeneratorInterface. -template -class ParamIteratorInterface { - public: - virtual ~ParamIteratorInterface() {} - // A pointer to the base generator instance. - // Used only for the purposes of iterator comparison - // to make sure that two iterators belong to the same generator. - virtual const ParamGeneratorInterface* BaseGenerator() const = 0; - // Advances iterator to point to the next element - // provided by the generator. The caller is responsible - // for not calling Advance() on an iterator equal to - // BaseGenerator()->End(). - virtual void Advance() = 0; - // Clones the iterator object. Used for implementing copy semantics - // of ParamIterator. - virtual ParamIteratorInterface* Clone() const = 0; - // Dereferences the current iterator and provides (read-only) access - // to the pointed value. It is the caller's responsibility not to call - // Current() on an iterator equal to BaseGenerator()->End(). - // Used for implementing ParamGenerator::operator*(). 
- virtual const T* Current() const = 0; - // Determines whether the given iterator and other point to the same - // element in the sequence generated by the generator. - // Used for implementing ParamGenerator::operator==(). - virtual bool Equals(const ParamIteratorInterface& other) const = 0; -}; - -// Class iterating over elements provided by an implementation of -// ParamGeneratorInterface. It wraps ParamIteratorInterface -// and implements the const forward iterator concept. -template -class ParamIterator { - public: - typedef T value_type; - typedef const T& reference; - typedef ptrdiff_t difference_type; - - // ParamIterator assumes ownership of the impl_ pointer. - ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {} - ParamIterator& operator=(const ParamIterator& other) { - if (this != &other) - impl_.reset(other.impl_->Clone()); - return *this; - } - - const T& operator*() const { return *impl_->Current(); } - const T* operator->() const { return impl_->Current(); } - // Prefix version of operator++. - ParamIterator& operator++() { - impl_->Advance(); - return *this; - } - // Postfix version of operator++. - ParamIterator operator++(int /*unused*/) { - ParamIteratorInterface* clone = impl_->Clone(); - impl_->Advance(); - return ParamIterator(clone); - } - bool operator==(const ParamIterator& other) const { - return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_); - } - bool operator!=(const ParamIterator& other) const { - return !(*this == other); - } - - private: - friend class ParamGenerator; - explicit ParamIterator(ParamIteratorInterface* impl) : impl_(impl) {} - scoped_ptr > impl_; -}; - -// ParamGeneratorInterface is the binary interface to access generators -// defined in other translation units. -template -class ParamGeneratorInterface { - public: - typedef T ParamType; - - virtual ~ParamGeneratorInterface() {} - - // Generator interface definition - virtual ParamIteratorInterface* Begin() const = 0; - virtual ParamIteratorInterface* End() const = 0; -}; - -// Wraps ParamGeneratorInterface and provides general generator syntax -// compatible with the STL Container concept. -// This class implements copy initialization semantics and the contained -// ParamGeneratorInterface instance is shared among all copies -// of the original object. This is possible because that instance is immutable. -template -class ParamGenerator { - public: - typedef ParamIterator iterator; - - explicit ParamGenerator(ParamGeneratorInterface* impl) : impl_(impl) {} - ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {} - - ParamGenerator& operator=(const ParamGenerator& other) { - impl_ = other.impl_; - return *this; - } - - iterator begin() const { return iterator(impl_->Begin()); } - iterator end() const { return iterator(impl_->End()); } - - private: - linked_ptr > impl_; -}; - -// Generates values from a range of two comparable values. Can be used to -// generate sequences of user-defined types that implement operator+() and -// operator<(). -// This class is used in the Range() function. 
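As a point of reference for the Range() generator implemented below, ParamGenerator behaves like a read-only STL container, so the produced sequence can be walked with begin()/end(); this sketch uses gtest's public Range() and touches the internal ParamGenerator type only for illustration:

#include <iostream>
#include "gtest/gtest.h"

void DumpRange() {
  // Range(begin, end, step): 'end' is excluded, matching the 'i < end' loop
  // in CalculateEndIndex() below.
  ::testing::internal::ParamGenerator<int> gen = ::testing::Range(0, 10, 3);
  for (::testing::internal::ParamGenerator<int>::iterator it = gen.begin();
       it != gen.end(); ++it) {
    std::cout << *it << " ";  // prints: 0 3 6 9
  }
}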
-template -class RangeGenerator : public ParamGeneratorInterface { - public: - RangeGenerator(T begin, T end, IncrementT step) - : begin_(begin), end_(end), - step_(step), end_index_(CalculateEndIndex(begin, end, step)) {} - virtual ~RangeGenerator() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, begin_, 0, step_); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, end_, end_index_, step_); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, T value, int index, - IncrementT step) - : base_(base), value_(value), index_(index), step_(step) {} - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - virtual void Advance() { - value_ = value_ + step_; - index_++; - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const T* Current() const { return &value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const int other_index = - CheckedDowncastToActualType(&other)->index_; - return index_ == other_index; - } - - private: - Iterator(const Iterator& other) - : ParamIteratorInterface(), - base_(other.base_), value_(other.value_), index_(other.index_), - step_(other.step_) {} - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - T value_; - int index_; - const IncrementT step_; - }; // class RangeGenerator::Iterator - - static int CalculateEndIndex(const T& begin, - const T& end, - const IncrementT& step) { - int end_index = 0; - for (T i = begin; i < end; i = i + step) - end_index++; - return end_index; - } - - // No implementation - assignment is unsupported. - void operator=(const RangeGenerator& other); - - const T begin_; - const T end_; - const IncrementT step_; - // The index for the end() iterator. All the elements in the generated - // sequence are indexed (0-based) to aid iterator comparison. - const int end_index_; -}; // class RangeGenerator - - -// Generates values from a pair of STL-style iterators. Used in the -// ValuesIn() function. The elements are copied from the source range -// since the source can be located on the stack, and the generator -// is likely to persist beyond that stack frame. 
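The copy-on-construction behaviour noted above means a generator built by ValuesIn() may safely outlive the range it was created from; a minimal sketch, with a hypothetical helper name:

#include <vector>
#include "gtest/gtest.h"

::testing::internal::ParamGenerator<int> MakeQpGenerator() {
  std::vector<int> local;
  local.push_back(2);
  local.push_back(4);
  local.push_back(8);
  // The elements are copied into the generator here, so 'local' can be
  // destroyed when this function returns.
  return ::testing::ValuesIn(local);
}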
-template -class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface { - public: - template - ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end) - : container_(begin, end) {} - virtual ~ValuesInIteratorRangeGenerator() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, container_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, container_.end()); - } - - private: - typedef typename ::std::vector ContainerType; - - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - typename ContainerType::const_iterator iterator) - : base_(base), iterator_(iterator) {} - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - virtual void Advance() { - ++iterator_; - value_.reset(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - // We need to use cached value referenced by iterator_ because *iterator_ - // can return a temporary object (and of type other then T), so just - // having "return &*iterator_;" doesn't work. - // value_ is updated here and not in Advance() because Advance() - // can advance iterator_ beyond the end of the range, and we cannot - // detect that fact. The client code, on the other hand, is - // responsible for not calling Current() on an out-of-range iterator. - virtual const T* Current() const { - if (value_.get() == NULL) - value_.reset(new T(*iterator_)); - return value_.get(); - } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - return iterator_ == - CheckedDowncastToActualType(&other)->iterator_; - } - - private: - Iterator(const Iterator& other) - // The explicit constructor call suppresses a false warning - // emitted by gcc when supplied with the -Wextra option. - : ParamIteratorInterface(), - base_(other.base_), - iterator_(other.iterator_) {} - - const ParamGeneratorInterface* const base_; - typename ContainerType::const_iterator iterator_; - // A cached value of *iterator_. We keep it here to allow access by - // pointer in the wrapping iterator's operator->(). - // value_ needs to be mutable to be accessed in Current(). - // Use of scoped_ptr helps manage cached value's lifetime, - // which is bound by the lifespan of the iterator itself. - mutable scoped_ptr value_; - }; // class ValuesInIteratorRangeGenerator::Iterator - - // No implementation - assignment is unsupported. - void operator=(const ValuesInIteratorRangeGenerator& other); - - const ContainerType container_; -}; // class ValuesInIteratorRangeGenerator - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Stores a parameter value and later creates tests parameterized with that -// value. 
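From the user's side, the factory machinery below is reached only through TEST_P() and GetParam(); a minimal sketch with a hypothetical fixture name:

#include "gtest/gtest.h"

class QpTest : public ::testing::TestWithParam<int> {};

TEST_P(QpTest, AcceptsValue) {
  const int qp = GetParam();  // the parameter value stored for this test instance
  EXPECT_GE(qp, 0);
}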
-template -class ParameterizedTestFactory : public TestFactoryBase { - public: - typedef typename TestClass::ParamType ParamType; - explicit ParameterizedTestFactory(ParamType parameter) : - parameter_(parameter) {} - virtual Test* CreateTest() { - TestClass::SetParam(¶meter_); - return new TestClass(); - } - - private: - const ParamType parameter_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory); -}; - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// TestMetaFactoryBase is a base class for meta-factories that create -// test factories for passing into MakeAndRegisterTestInfo function. -template -class TestMetaFactoryBase { - public: - virtual ~TestMetaFactoryBase() {} - - virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0; -}; - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// TestMetaFactory creates test factories for passing into -// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives -// ownership of test factory pointer, same factory object cannot be passed -// into that method twice. But ParameterizedTestCaseInfo is going to call -// it for each Test/Parameter value combination. Thus it needs meta factory -// creator class. -template -class TestMetaFactory - : public TestMetaFactoryBase { - public: - typedef typename TestCase::ParamType ParamType; - - TestMetaFactory() {} - - virtual TestFactoryBase* CreateTestFactory(ParamType parameter) { - return new ParameterizedTestFactory(parameter); - } - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory); -}; - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// ParameterizedTestCaseInfoBase is a generic interface -// to ParameterizedTestCaseInfo classes. ParameterizedTestCaseInfoBase -// accumulates test information provided by TEST_P macro invocations -// and generators provided by INSTANTIATE_TEST_CASE_P macro invocations -// and uses that information to register all resulting test instances -// in RegisterTests method. The ParameterizeTestCaseRegistry class holds -// a collection of pointers to the ParameterizedTestCaseInfo objects -// and calls RegisterTests() on each of them when asked. -class ParameterizedTestCaseInfoBase { - public: - virtual ~ParameterizedTestCaseInfoBase() {} - - // Base part of test case name for display purposes. - virtual const string& GetTestCaseName() const = 0; - // Test case id to verify identity. - virtual TypeId GetTestCaseTypeId() const = 0; - // UnitTest class invokes this method to register tests in this - // test case right before running them in RUN_ALL_TESTS macro. - // This method should not be called more then once on any single - // instance of a ParameterizedTestCaseInfoBase derived class. - virtual void RegisterTests() = 0; - - protected: - ParameterizedTestCaseInfoBase() {} - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfoBase); -}; - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// ParameterizedTestCaseInfo accumulates tests obtained from TEST_P -// macro invocations for a particular test case and generators -// obtained from INSTANTIATE_TEST_CASE_P macro invocations for that -// test case. It registers tests with all values generated by all -// generators when asked. -template -class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { - public: - // ParamType and GeneratorCreationFunc are private types but are required - // for declarations of public methods AddTestPattern() and - // AddTestCaseInstantiation(). 
- typedef typename TestCase::ParamType ParamType; - // A function that returns an instance of appropriate generator type. - typedef ParamGenerator(GeneratorCreationFunc)(); - - explicit ParameterizedTestCaseInfo(const char* name) - : test_case_name_(name) {} - - // Test case base name for display purposes. - virtual const string& GetTestCaseName() const { return test_case_name_; } - // Test case id to verify identity. - virtual TypeId GetTestCaseTypeId() const { return GetTypeId(); } - // TEST_P macro uses AddTestPattern() to record information - // about a single test in a LocalTestInfo structure. - // test_case_name is the base name of the test case (without invocation - // prefix). test_base_name is the name of an individual test without - // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is - // test case base name and DoBar is test base name. - void AddTestPattern(const char* test_case_name, - const char* test_base_name, - TestMetaFactoryBase* meta_factory) { - tests_.push_back(linked_ptr(new TestInfo(test_case_name, - test_base_name, - meta_factory))); - } - // INSTANTIATE_TEST_CASE_P macro uses AddGenerator() to record information - // about a generator. - int AddTestCaseInstantiation(const string& instantiation_name, - GeneratorCreationFunc* func, - const char* /* file */, - int /* line */) { - instantiations_.push_back(::std::make_pair(instantiation_name, func)); - return 0; // Return value used only to run this method in namespace scope. - } - // UnitTest class invokes this method to register tests in this test case - // test cases right before running tests in RUN_ALL_TESTS macro. - // This method should not be called more then once on any single - // instance of a ParameterizedTestCaseInfoBase derived class. - // UnitTest has a guard to prevent from calling this method more then once. - virtual void RegisterTests() { - for (typename TestInfoContainer::iterator test_it = tests_.begin(); - test_it != tests_.end(); ++test_it) { - linked_ptr test_info = *test_it; - for (typename InstantiationContainer::iterator gen_it = - instantiations_.begin(); gen_it != instantiations_.end(); - ++gen_it) { - const string& instantiation_name = gen_it->first; - ParamGenerator generator((*gen_it->second)()); - - string test_case_name; - if ( !instantiation_name.empty() ) - test_case_name = instantiation_name + "/"; - test_case_name += test_info->test_case_base_name; - - int i = 0; - for (typename ParamGenerator::iterator param_it = - generator.begin(); - param_it != generator.end(); ++param_it, ++i) { - Message test_name_stream; - test_name_stream << test_info->test_base_name << "/" << i; - MakeAndRegisterTestInfo( - test_case_name.c_str(), - test_name_stream.GetString().c_str(), - NULL, // No type parameter. - PrintToString(*param_it).c_str(), - GetTestCaseTypeId(), - TestCase::SetUpTestCase, - TestCase::TearDownTestCase, - test_info->test_meta_factory->CreateTestFactory(*param_it)); - } // for param_it - } // for gen_it - } // for test_it - } // RegisterTests - - private: - // LocalTestInfo structure keeps information about a single test registered - // with TEST_P macro. 
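Continuing the hypothetical QpTest fixture above, the registration loop just shown expands a single instantiation into one test per generated value, named InstantiationName/FixtureName.TestName/index:

#include "gtest/gtest.h"

// Registers HighBitDepth/QpTest.AcceptsValue/0, .../1 and .../2 with the
// parameter values 0, 32 and 63 respectively.
INSTANTIATE_TEST_CASE_P(HighBitDepth, QpTest, ::testing::Values(0, 32, 63));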
- struct TestInfo { - TestInfo(const char* a_test_case_base_name, - const char* a_test_base_name, - TestMetaFactoryBase* a_test_meta_factory) : - test_case_base_name(a_test_case_base_name), - test_base_name(a_test_base_name), - test_meta_factory(a_test_meta_factory) {} - - const string test_case_base_name; - const string test_base_name; - const scoped_ptr > test_meta_factory; - }; - typedef ::std::vector > TestInfoContainer; - // Keeps pairs of - // received from INSTANTIATE_TEST_CASE_P macros. - typedef ::std::vector > - InstantiationContainer; - - const string test_case_name_; - TestInfoContainer tests_; - InstantiationContainer instantiations_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfo); -}; // class ParameterizedTestCaseInfo - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// ParameterizedTestCaseRegistry contains a map of ParameterizedTestCaseInfoBase -// classes accessed by test case names. TEST_P and INSTANTIATE_TEST_CASE_P -// macros use it to locate their corresponding ParameterizedTestCaseInfo -// descriptors. -class ParameterizedTestCaseRegistry { - public: - ParameterizedTestCaseRegistry() {} - ~ParameterizedTestCaseRegistry() { - for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); - it != test_case_infos_.end(); ++it) { - delete *it; - } - } - - // Looks up or creates and returns a structure containing information about - // tests and instantiations of a particular test case. - template - ParameterizedTestCaseInfo* GetTestCasePatternHolder( - const char* test_case_name, - const char* file, - int line) { - ParameterizedTestCaseInfo* typed_test_info = NULL; - for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); - it != test_case_infos_.end(); ++it) { - if ((*it)->GetTestCaseName() == test_case_name) { - if ((*it)->GetTestCaseTypeId() != GetTypeId()) { - // Complain about incorrect usage of Google Test facilities - // and terminate the program since we cannot guaranty correct - // test case setup and tear-down in this case. - ReportInvalidTestCaseType(test_case_name, file, line); - posix::Abort(); - } else { - // At this point we are sure that the object we found is of the same - // type we are looking for, so we downcast it to that type - // without further checks. - typed_test_info = CheckedDowncastToActualType< - ParameterizedTestCaseInfo >(*it); - } - break; - } - } - if (typed_test_info == NULL) { - typed_test_info = new ParameterizedTestCaseInfo(test_case_name); - test_case_infos_.push_back(typed_test_info); - } - return typed_test_info; - } - void RegisterTests() { - for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); - it != test_case_infos_.end(); ++it) { - (*it)->RegisterTests(); - } - } - - private: - typedef ::std::vector TestCaseInfoContainer; - - TestCaseInfoContainer test_case_infos_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseRegistry); -}; - -} // namespace internal -} // namespace testing - -#endif // GTEST_HAS_PARAM_TEST - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ -// This file was GENERATED by command: -// pump.py gtest-param-util-generated.h.pump -// DO NOT EDIT BY HAND!!! - -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. 
-// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: vladl@google.com (Vlad Losev) - -// Type and function utilities for implementing parameterized tests. -// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// -// Currently Google Test supports at most 50 arguments in Values, -// and at most 10 arguments in Combine. Please contact -// googletestframework@googlegroups.com if you need more. -// Please note that the number of arguments to Combine is limited -// by the maximum arity of the implementation of tr1::tuple which is -// currently set at 10. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ - -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. - -#if GTEST_HAS_PARAM_TEST - -namespace testing { - -// Forward declarations of ValuesIn(), which is implemented in -// include/gtest/gtest-param-test.h. -template -internal::ParamGenerator< - typename ::testing::internal::IteratorTraits::value_type> -ValuesIn(ForwardIterator begin, ForwardIterator end); - -template -internal::ParamGenerator ValuesIn(const T (&array)[N]); - -template -internal::ParamGenerator ValuesIn( - const Container& container); - -namespace internal { - -// Used in the Values() function to provide polymorphic capabilities. -template -class ValueArray1 { - public: - explicit ValueArray1(T1 v1) : v1_(v1) {} - - template - operator ParamGenerator() const { return ValuesIn(&v1_, &v1_ + 1); } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray1& other); - - const T1 v1_; -}; - -template -class ValueArray2 { - public: - ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray2& other); - - const T1 v1_; - const T2 v2_; -}; - -template -class ValueArray3 { - public: - ValueArray3(T1 v1, T2 v2, T3 v3) : v1_(v1), v2_(v2), v3_(v3) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray3& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; -}; - -template -class ValueArray4 { - public: - ValueArray4(T1 v1, T2 v2, T3 v3, T4 v4) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray4& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; -}; - -template -class ValueArray5 { - public: - ValueArray5(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray5& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; -}; - -template -class ValueArray6 { - public: - ValueArray6(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray6& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; -}; - -template -class ValueArray7 { - public: - ValueArray7(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray7& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; -}; - -template -class ValueArray8 { - public: - ValueArray8(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray8& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; -}; - -template -class ValueArray9 { - public: - ValueArray9(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray9& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; -}; - -template -class ValueArray10 { - public: - ValueArray10(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray10& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; -}; - -template -class ValueArray11 { - public: - ValueArray11(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray11& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; -}; - -template -class ValueArray12 { - public: - ValueArray12(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray12& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; -}; - -template -class ValueArray13 { - public: - ValueArray13(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray13& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; -}; - -template -class ValueArray14 { - public: - ValueArray14(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray14& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; -}; - -template -class ValueArray15 { - public: - ValueArray15(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray15& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; -}; - -template -class ValueArray16 { - public: - ValueArray16(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray16& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; -}; - -template -class ValueArray17 { - public: - ValueArray17(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, - T17 v17) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray17& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; -}; - -template -class ValueArray18 { - public: - ValueArray18(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray18& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; -}; - -template -class ValueArray19 { - public: - ValueArray19(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray19& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; -}; - -template -class ValueArray20 { - public: - ValueArray20(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray20& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; -}; - -template -class ValueArray21 { - public: - ValueArray21(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray21& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; -}; - -template -class ValueArray22 { - public: - ValueArray22(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray22& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; -}; - -template -class ValueArray23 { - public: - ValueArray23(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray23& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; -}; - -template -class ValueArray24 { - public: - ValueArray24(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray24& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; -}; - -template -class ValueArray25 { - public: - ValueArray25(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, - T25 v25) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray25& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; -}; - -template -class ValueArray26 { - public: - ValueArray26(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray26& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; -}; - -template -class ValueArray27 { - public: - ValueArray27(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), - v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), - v26_(v26), v27_(v27) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray27& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; -}; - -template -class ValueArray28 { - public: - ValueArray28(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), - v25_(v25), v26_(v26), v27_(v27), v28_(v28) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray28& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; -}; - -template -class ValueArray29 { - public: - ValueArray29(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), - v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray29& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; -}; - -template -class ValueArray30 { - public: - ValueArray30(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray30& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; -}; - -template -class ValueArray31 { - public: - ValueArray31(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray31& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; -}; - -template -class ValueArray32 { - public: - ValueArray32(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), - v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray32& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; -}; - -template -class ValueArray33 { - public: - ValueArray33(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, - T33 v33) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray33& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; -}; - -template -class ValueArray34 { - public: - ValueArray34(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray34& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; -}; - -template -class ValueArray35 { - public: - ValueArray35(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), - v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), - v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), - v32_(v32), v33_(v33), v34_(v34), v35_(v35) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray35& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; -}; - -template -class ValueArray36 { - public: - ValueArray36(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), - v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), - v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray36& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; -}; - -template -class ValueArray37 { - public: - ValueArray37(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), - v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), - v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), - v36_(v36), v37_(v37) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray37& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; -}; - -template -class ValueArray38 { - public: - ValueArray38(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray38& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; -}; - -template -class ValueArray39 { - public: - ValueArray39(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray39& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; -}; - -template -class ValueArray40 { - public: - ValueArray40(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), - v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), - v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), - v40_(v40) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray40& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; -}; - -template -class ValueArray41 { - public: - ValueArray41(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, - T41 v41) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray41& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; -}; - -template -class ValueArray42 { - public: - ValueArray42(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41), v42_(v42) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray42& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; -}; - -template -class ValueArray43 { - public: - ValueArray43(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), - v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), - v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), - v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), - v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray43& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; -}; - -template -class ValueArray44 { - public: - ValueArray44(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), - v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), - v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), - v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), - v43_(v43), v44_(v44) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray44& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; -}; - -template -class ValueArray45 { - public: - ValueArray45(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), - v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), - v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), - v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), - v42_(v42), v43_(v43), v44_(v44), v45_(v45) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray45& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; -}; - -template -class ValueArray46 { - public: - ValueArray46(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), - v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray46& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; -}; - -template -class ValueArray47 { - public: - ValueArray47(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), - v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46), - v47_(v47) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_), static_cast(v47_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray47& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; - const T47 v47_; -}; - -template -class ValueArray48 { - public: - ValueArray48(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), - v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), - v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), - v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), - v46_(v46), v47_(v47), v48_(v48) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_), static_cast(v47_), - static_cast(v48_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray48& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; - const T47 v47_; - const T48 v48_; -}; - -template -class ValueArray49 { - public: - ValueArray49(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, - T49 v49) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), - v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_), static_cast(v47_), - static_cast(v48_), static_cast(v49_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray49& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; - const T47 v47_; - const T48 v48_; - const T49 v49_; -}; - -template -class ValueArray50 { - public: - ValueArray50(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, T49 v49, - T50 v50) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), - v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49), v50_(v50) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_), static_cast(v47_), - static_cast(v48_), static_cast(v49_), static_cast(v50_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
-  void operator=(const ValueArray50& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-  const T43 v43_;
-  const T44 v44_;
-  const T45 v45_;
-  const T46 v46_;
-  const T47 v47_;
-  const T48 v48_;
-  const T49 v49_;
-  const T50 v50_;
-};
-
-# if GTEST_HAS_COMBINE
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Generates values from the Cartesian product of values produced
-// by the argument generators.
-//
-template <typename T1, typename T2>
-class CartesianProductGenerator2
-    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2> > {
- public:
-  typedef ::std::tr1::tuple<T1, T2> ParamType;
-
-  CartesianProductGenerator2(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2)
-      : g1_(g1), g2_(g2) {}
-  virtual ~CartesianProductGenerator2() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2) {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current2_;
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
- return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - ParamType current_value_; - }; // class CartesianProductGenerator2::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator2& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; -}; // class CartesianProductGenerator2 - - -template -class CartesianProductGenerator3 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator3(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3) - : g1_(g1), g2_(g2), g3_(g3) {} - virtual ~CartesianProductGenerator3() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. 
- virtual void Advance() { - assert(!AtEnd()); - ++current3_; - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - ParamType current_value_; - }; // class CartesianProductGenerator3::Iterator - - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductGenerator3& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; -}; // class CartesianProductGenerator3 - - -template -class CartesianProductGenerator4 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator4(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} - virtual ~CartesianProductGenerator4() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current4_; - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). 
- return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - ParamType current_value_; - }; // class CartesianProductGenerator4::Iterator - - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductGenerator4& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; -}; // class CartesianProductGenerator4 - - -template -class CartesianProductGenerator5 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator5(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} - virtual ~CartesianProductGenerator5() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current5_; - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). 
- return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - ParamType current_value_; - }; // class CartesianProductGenerator5::Iterator - - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductGenerator5& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; -}; // class CartesianProductGenerator5 - - -template -class CartesianProductGenerator6 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator6(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} - virtual ~CartesianProductGenerator6() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current6_; - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). 
- return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - ParamType current_value_; - }; // class CartesianProductGenerator6::Iterator - - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductGenerator6& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; -}; // class CartesianProductGenerator6 - - -template -class CartesianProductGenerator7 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator7(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} - virtual ~CartesianProductGenerator7() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current7_; - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. 
- GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - ParamType current_value_; - }; // class CartesianProductGenerator7::Iterator - - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductGenerator7& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; -}; // class CartesianProductGenerator7 - - -template -class CartesianProductGenerator8 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator8(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7, - const ParamGenerator& g8) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), - g8_(g8) {} - virtual ~CartesianProductGenerator8() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin(), g8_, g8_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, - g8_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7, - const ParamGenerator& g8, - const typename ParamGenerator::iterator& current8) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7), - begin8_(g8.begin()), end8_(g8.end()), current8_(current8) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. 
- virtual void Advance() { - assert(!AtEnd()); - ++current8_; - if (current8_ == end8_) { - current8_ = begin8_; - ++current7_; - } - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_ && - current8_ == typed_other->current8_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_), - begin8_(other.begin8_), - end8_(other.end8_), - current8_(other.current8_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_, *current8_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_ || - current8_ == end8_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. 
- const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - const typename ParamGenerator::iterator begin8_; - const typename ParamGenerator::iterator end8_; - typename ParamGenerator::iterator current8_; - ParamType current_value_; - }; // class CartesianProductGenerator8::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator8& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; - const ParamGenerator g8_; -}; // class CartesianProductGenerator8 - - -template -class CartesianProductGenerator9 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator9(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7, - const ParamGenerator& g8, const ParamGenerator& g9) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9) {} - virtual ~CartesianProductGenerator9() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, - g8_.end(), g9_, g9_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7, - const ParamGenerator& g8, - const typename ParamGenerator::iterator& current8, - const ParamGenerator& g9, - const typename ParamGenerator::iterator& current9) - : base_(base), - 
begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7), - begin8_(g8.begin()), end8_(g8.end()), current8_(current8), - begin9_(g9.begin()), end9_(g9.end()), current9_(current9) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current9_; - if (current9_ == end9_) { - current9_ = begin9_; - ++current8_; - } - if (current8_ == end8_) { - current8_ = begin8_; - ++current7_; - } - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). 
- return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_ && - current8_ == typed_other->current8_ && - current9_ == typed_other->current9_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_), - begin8_(other.begin8_), - end8_(other.end8_), - current8_(other.current8_), - begin9_(other.begin9_), - end9_(other.end9_), - current9_(other.current9_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_, *current8_, - *current9_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_ || - current8_ == end8_ || - current9_ == end9_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - const typename ParamGenerator::iterator begin8_; - const typename ParamGenerator::iterator end8_; - typename ParamGenerator::iterator current8_; - const typename ParamGenerator::iterator begin9_; - const typename ParamGenerator::iterator end9_; - typename ParamGenerator::iterator current9_; - ParamType current_value_; - }; // class CartesianProductGenerator9::Iterator - - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductGenerator9& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; - const ParamGenerator g8_; - const ParamGenerator g9_; -}; // class CartesianProductGenerator9 - - -template -class CartesianProductGenerator10 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator10(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7, - const ParamGenerator& g8, const ParamGenerator& g9, - const ParamGenerator& g10) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9), g10_(g10) {} - virtual ~CartesianProductGenerator10() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin(), g10_, g10_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, - g8_.end(), g9_, g9_.end(), g10_, g10_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7, - const ParamGenerator& g8, - const typename ParamGenerator::iterator& current8, - const ParamGenerator& g9, - const typename ParamGenerator::iterator& current9, - const ParamGenerator& g10, - const typename ParamGenerator::iterator& current10) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7), - begin8_(g8.begin()), end8_(g8.end()), current8_(current8), - begin9_(g9.begin()), end9_(g9.end()), current9_(current9), - begin10_(g10.begin()), end10_(g10.end()), current10_(current10) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. 
- virtual void Advance() { - assert(!AtEnd()); - ++current10_; - if (current10_ == end10_) { - current10_ = begin10_; - ++current9_; - } - if (current9_ == end9_) { - current9_ = begin9_; - ++current8_; - } - if (current8_ == end8_) { - current8_ = begin8_; - ++current7_; - } - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_ && - current8_ == typed_other->current8_ && - current9_ == typed_other->current9_ && - current10_ == typed_other->current10_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_), - begin8_(other.begin8_), - end8_(other.end8_), - current8_(other.current8_), - begin9_(other.begin9_), - end9_(other.end9_), - current9_(other.current9_), - begin10_(other.begin10_), - end10_(other.end10_), - current10_(other.current10_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_, *current8_, - *current9_, *current10_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_ || - current8_ == end8_ || - current9_ == end9_ || - current10_ == end10_; - } - - // No implementation - assignment is unsupported. 
- void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - const typename ParamGenerator::iterator begin8_; - const typename ParamGenerator::iterator end8_; - typename ParamGenerator::iterator current8_; - const typename ParamGenerator::iterator begin9_; - const typename ParamGenerator::iterator end9_; - typename ParamGenerator::iterator current9_; - const typename ParamGenerator::iterator begin10_; - const typename ParamGenerator::iterator end10_; - typename ParamGenerator::iterator current10_; - ParamType current_value_; - }; // class CartesianProductGenerator10::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator10& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; - const ParamGenerator g8_; - const ParamGenerator g9_; - const ParamGenerator g10_; -}; // class CartesianProductGenerator10 - - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Helper classes providing Combine() with polymorphic features. They allow -// casting CartesianProductGeneratorN to ParamGenerator if T is -// convertible to U. -// -template -class CartesianProductHolder2 { - public: -CartesianProductHolder2(const Generator1& g1, const Generator2& g2) - : g1_(g1), g2_(g2) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator2( - static_cast >(g1_), - static_cast >(g2_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder2& other); - - const Generator1 g1_; - const Generator2 g2_; -}; // class CartesianProductHolder2 - -template -class CartesianProductHolder3 { - public: -CartesianProductHolder3(const Generator1& g1, const Generator2& g2, - const Generator3& g3) - : g1_(g1), g2_(g2), g3_(g3) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator3( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_))); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductHolder3& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; -}; // class CartesianProductHolder3 - -template -class CartesianProductHolder4 { - public: -CartesianProductHolder4(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator4( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder4& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; -}; // class CartesianProductHolder4 - -template -class CartesianProductHolder5 { - public: -CartesianProductHolder5(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator5( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder5& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; -}; // class CartesianProductHolder5 - -template -class CartesianProductHolder6 { - public: -CartesianProductHolder6(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator6( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder6& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; -}; // class CartesianProductHolder6 - -template -class CartesianProductHolder7 { - public: -CartesianProductHolder7(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator7( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_))); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductHolder7& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; -}; // class CartesianProductHolder7 - -template -class CartesianProductHolder8 { - public: -CartesianProductHolder8(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7, const Generator8& g8) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), - g8_(g8) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator8( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_), - static_cast >(g8_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder8& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; - const Generator8 g8_; -}; // class CartesianProductHolder8 - -template -class CartesianProductHolder9 { - public: -CartesianProductHolder9(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7, const Generator8& g8, - const Generator9& g9) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator9( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_), - static_cast >(g8_), - static_cast >(g9_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder9& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; - const Generator8 g8_; - const Generator9 g9_; -}; // class CartesianProductHolder9 - -template -class CartesianProductHolder10 { - public: -CartesianProductHolder10(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7, const Generator8& g8, - const Generator9& g9, const Generator10& g10) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9), g10_(g10) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator10( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_), - static_cast >(g8_), - static_cast >(g9_), - static_cast >(g10_))); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductHolder10& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; - const Generator8 g8_; - const Generator9 g9_; - const Generator10 g10_; -}; // class CartesianProductHolder10 - -# endif // GTEST_HAS_COMBINE - -} // namespace internal -} // namespace testing - -#endif // GTEST_HAS_PARAM_TEST - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ - -#if GTEST_HAS_PARAM_TEST - -namespace testing { - -// Functions producing parameter generators. -// -// Google Test uses these generators to produce parameters for value- -// parameterized tests. When a parameterized test case is instantiated -// with a particular generator, Google Test creates and runs tests -// for each element in the sequence produced by the generator. -// -// In the following sample, tests from test case FooTest are instantiated -// each three times with parameter values 3, 5, and 8: -// -// class FooTest : public TestWithParam { ... }; -// -// TEST_P(FooTest, TestThis) { -// } -// TEST_P(FooTest, TestThat) { -// } -// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8)); -// - -// Range() returns generators providing sequences of values in a range. -// -// Synopsis: -// Range(start, end) -// - returns a generator producing a sequence of values {start, start+1, -// start+2, ..., }. -// Range(start, end, step) -// - returns a generator producing a sequence of values {start, start+step, -// start+step+step, ..., }. -// Notes: -// * The generated sequences never include end. For example, Range(1, 5) -// returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2) -// returns a generator producing {1, 3, 5, 7}. -// * start and end must have the same type. That type may be any integral or -// floating-point type or a user defined type satisfying these conditions: -// * It must be assignable (have operator=() defined). -// * It must have operator+() (operator+(int-compatible type) for -// two-operand version). -// * It must have operator<() defined. -// Elements in the resulting sequences will also have that type. -// * Condition start < end must be satisfied in order for resulting sequences -// to contain any elements. -// -template -internal::ParamGenerator Range(T start, T end, IncrementT step) { - return internal::ParamGenerator( - new internal::RangeGenerator(start, end, step)); -} - -template -internal::ParamGenerator Range(T start, T end) { - return Range(start, end, 1); -} - -// ValuesIn() function allows generation of tests with parameters coming from -// a container. -// -// Synopsis: -// ValuesIn(const T (&array)[N]) -// - returns a generator producing sequences with elements from -// a C-style array. -// ValuesIn(const Container& container) -// - returns a generator producing sequences with elements from -// an STL-style container. -// ValuesIn(Iterator begin, Iterator end) -// - returns a generator producing sequences with elements from -// a range [begin, end) defined by a pair of STL-style iterators. These -// iterators can also be plain C pointers. -// -// Please note that ValuesIn copies the values from the containers -// passed in and keeps them to generate tests in RUN_ALL_TESTS(). 
-// -// Examples: -// -// This instantiates tests from test case StringTest -// each with C-string values of "foo", "bar", and "baz": -// -// const char* strings[] = {"foo", "bar", "baz"}; -// INSTANTIATE_TEST_CASE_P(StringSequence, SrtingTest, ValuesIn(strings)); -// -// This instantiates tests from test case StlStringTest -// each with STL strings with values "a" and "b": -// -// ::std::vector< ::std::string> GetParameterStrings() { -// ::std::vector< ::std::string> v; -// v.push_back("a"); -// v.push_back("b"); -// return v; -// } -// -// INSTANTIATE_TEST_CASE_P(CharSequence, -// StlStringTest, -// ValuesIn(GetParameterStrings())); -// -// -// This will also instantiate tests from CharTest -// each with parameter values 'a' and 'b': -// -// ::std::list GetParameterChars() { -// ::std::list list; -// list.push_back('a'); -// list.push_back('b'); -// return list; -// } -// ::std::list l = GetParameterChars(); -// INSTANTIATE_TEST_CASE_P(CharSequence2, -// CharTest, -// ValuesIn(l.begin(), l.end())); -// -template -internal::ParamGenerator< - typename ::testing::internal::IteratorTraits::value_type> -ValuesIn(ForwardIterator begin, ForwardIterator end) { - typedef typename ::testing::internal::IteratorTraits - ::value_type ParamType; - return internal::ParamGenerator( - new internal::ValuesInIteratorRangeGenerator(begin, end)); -} - -template -internal::ParamGenerator ValuesIn(const T (&array)[N]) { - return ValuesIn(array, array + N); -} - -template -internal::ParamGenerator ValuesIn( - const Container& container) { - return ValuesIn(container.begin(), container.end()); -} - -// Values() allows generating tests from explicitly specified list of -// parameters. -// -// Synopsis: -// Values(T v1, T v2, ..., T vN) -// - returns a generator producing sequences with elements v1, v2, ..., vN. -// -// For example, this instantiates tests from test case BarTest each -// with values "one", "two", and "three": -// -// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three")); -// -// This instantiates tests from test case BazTest each with values 1, 2, 3.5. -// The exact type of values will depend on the type of parameter in BazTest. -// -// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5)); -// -// Currently, Values() supports from 1 to 50 parameters. 
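// A minimal usage sketch (not taken from the gtest sources above; the
// PrimeTest fixture name and the parameter values are illustrative only)
// showing how Range(), ValuesIn(), and Values() feed a value-parameterized
// test case:
//
//   #include "gtest/gtest.h"
//
//   class PrimeTest : public testing::TestWithParam<int> {};
//
//   TEST_P(PrimeTest, IsPositive) { EXPECT_GT(GetParam(), 0); }
//
//   // Range(2, 10, 2) yields 2, 4, 6, 8 (the end value is never included).
//   INSTANTIATE_TEST_CASE_P(Evens, PrimeTest, testing::Range(2, 10, 2));
//
//   // ValuesIn() copies its parameters from an array or STL-style container.
//   const int kSmallPrimes[] = {2, 3, 5, 7, 11};
//   INSTANTIATE_TEST_CASE_P(FromArray, PrimeTest,
//                           testing::ValuesIn(kSmallPrimes));
//
//   // Values() takes an explicit list of 1 to 50 parameters.
//   INSTANTIATE_TEST_CASE_P(Explicit, PrimeTest, testing::Values(13, 17, 19));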
-// -template -internal::ValueArray1 Values(T1 v1) { - return internal::ValueArray1(v1); -} - -template -internal::ValueArray2 Values(T1 v1, T2 v2) { - return internal::ValueArray2(v1, v2); -} - -template -internal::ValueArray3 Values(T1 v1, T2 v2, T3 v3) { - return internal::ValueArray3(v1, v2, v3); -} - -template -internal::ValueArray4 Values(T1 v1, T2 v2, T3 v3, T4 v4) { - return internal::ValueArray4(v1, v2, v3, v4); -} - -template -internal::ValueArray5 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5) { - return internal::ValueArray5(v1, v2, v3, v4, v5); -} - -template -internal::ValueArray6 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6) { - return internal::ValueArray6(v1, v2, v3, v4, v5, v6); -} - -template -internal::ValueArray7 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6, T7 v7) { - return internal::ValueArray7(v1, v2, v3, v4, v5, - v6, v7); -} - -template -internal::ValueArray8 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8) { - return internal::ValueArray8(v1, v2, v3, v4, - v5, v6, v7, v8); -} - -template -internal::ValueArray9 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9) { - return internal::ValueArray9(v1, v2, v3, - v4, v5, v6, v7, v8, v9); -} - -template -internal::ValueArray10 Values(T1 v1, - T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10) { - return internal::ValueArray10(v1, - v2, v3, v4, v5, v6, v7, v8, v9, v10); -} - -template -internal::ValueArray11 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11) { - return internal::ValueArray11(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11); -} - -template -internal::ValueArray12 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12) { - return internal::ValueArray12(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12); -} - -template -internal::ValueArray13 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13) { - return internal::ValueArray13(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13); -} - -template -internal::ValueArray14 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) { - return internal::ValueArray14(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, - v14); -} - -template -internal::ValueArray15 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) { - return internal::ValueArray15(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, - v13, v14, v15); -} - -template -internal::ValueArray16 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16) { - return internal::ValueArray16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, - v12, v13, v14, v15, v16); -} - -template -internal::ValueArray17 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17) { - return internal::ValueArray17(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, - v11, v12, v13, v14, v15, v16, v17); -} - -template -internal::ValueArray18 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, - T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18) { - return internal::ValueArray18(v1, v2, v3, v4, v5, v6, v7, v8, v9, - v10, v11, v12, v13, v14, v15, v16, v17, v18); -} - -template -internal::ValueArray19 
Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, - T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, - T15 v15, T16 v16, T17 v17, T18 v18, T19 v19) { - return internal::ValueArray19(v1, v2, v3, v4, v5, v6, v7, v8, - v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19); -} - -template -internal::ValueArray20 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20) { - return internal::ValueArray20(v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20); -} - -template -internal::ValueArray21 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21) { - return internal::ValueArray21(v1, v2, v3, v4, v5, v6, - v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21); -} - -template -internal::ValueArray22 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22) { - return internal::ValueArray22(v1, v2, v3, v4, - v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22); -} - -template -internal::ValueArray23 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23) { - return internal::ValueArray23(v1, v2, v3, - v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23); -} - -template -internal::ValueArray24 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24) { - return internal::ValueArray24(v1, v2, - v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, - v19, v20, v21, v22, v23, v24); -} - -template -internal::ValueArray25 Values(T1 v1, - T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, - T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, - T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25) { - return internal::ValueArray25(v1, - v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, - v18, v19, v20, v21, v22, v23, v24, v25); -} - -template -internal::ValueArray26 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26) { - return internal::ValueArray26(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, - v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26); -} - -template -internal::ValueArray27 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27) { - return internal::ValueArray27(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, - v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27); -} - -template -internal::ValueArray28 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 
v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28) { - return internal::ValueArray28(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, - v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, - v28); -} - -template -internal::ValueArray29 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29) { - return internal::ValueArray29(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, - v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, - v27, v28, v29); -} - -template -internal::ValueArray30 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, - T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, - T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) { - return internal::ValueArray30(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, - v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, - v26, v27, v28, v29, v30); -} - -template -internal::ValueArray31 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) { - return internal::ValueArray31(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, - v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, - v25, v26, v27, v28, v29, v30, v31); -} - -template -internal::ValueArray32 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32) { - return internal::ValueArray32(v1, v2, v3, v4, v5, v6, v7, v8, v9, - v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32); -} - -template -internal::ValueArray33 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, - T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33) { - return internal::ValueArray33(v1, v2, v3, v4, v5, v6, v7, v8, - v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32, v33); -} - -template -internal::ValueArray34 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, - T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, - T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, - T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, - T31 v31, T32 v32, T33 v33, T34 v34) { - return internal::ValueArray34(v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, - v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34); -} - -template -internal::ValueArray35 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, 
T19 v19, T20 v20, T21 v21, - T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, - T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35) { - return internal::ValueArray35(v1, v2, v3, v4, v5, v6, - v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, - v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35); -} - -template -internal::ValueArray36 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, - T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, - T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36) { - return internal::ValueArray36(v1, v2, v3, v4, - v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, - v34, v35, v36); -} - -template -internal::ValueArray37 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, - T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, - T37 v37) { - return internal::ValueArray37(v1, v2, v3, - v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, - v34, v35, v36, v37); -} - -template -internal::ValueArray38 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, - T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, - T37 v37, T38 v38) { - return internal::ValueArray38(v1, v2, - v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, - v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, - v33, v34, v35, v36, v37, v38); -} - -template -internal::ValueArray39 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, - T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, - T37 v37, T38 v38, T39 v39) { - return internal::ValueArray39(v1, - v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, - v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, - v32, v33, v34, v35, v36, v37, v38, v39); -} - -template -internal::ValueArray40 Values(T1 v1, - T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, - T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, - T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, - T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, - T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) { - return internal::ValueArray40(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, - v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, - v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40); -} - -template -internal::ValueArray41 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - 
T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41) { - return internal::ValueArray41(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, - v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, - v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41); -} - -template -internal::ValueArray42 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42) { - return internal::ValueArray42(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, - v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, - v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, - v42); -} - -template -internal::ValueArray43 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43) { - return internal::ValueArray43(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, - v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, - v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, - v41, v42, v43); -} - -template -internal::ValueArray44 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44) { - return internal::ValueArray44(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, - v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, - v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, - v40, v41, v42, v43, v44); -} - -template -internal::ValueArray45 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, - T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, - T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, - T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, - T41 v41, T42 v42, T43 v43, T44 v44, T45 v45) { - return internal::ValueArray45(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, - v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, - v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, - v39, v40, v41, v42, v43, v44, v45); -} - -template -internal::ValueArray46 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 
v39, - T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) { - return internal::ValueArray46(v1, v2, v3, v4, v5, v6, v7, v8, v9, - v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, - v38, v39, v40, v41, v42, v43, v44, v45, v46); -} - -template -internal::ValueArray47 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, - T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) { - return internal::ValueArray47(v1, v2, v3, v4, v5, v6, v7, v8, - v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, - v38, v39, v40, v41, v42, v43, v44, v45, v46, v47); -} - -template -internal::ValueArray48 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, - T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, - T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, - T48 v48) { - return internal::ValueArray48(v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, - v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, - v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48); -} - -template -internal::ValueArray49 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, - T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, - T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, - T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, - T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, - T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, - T47 v47, T48 v48, T49 v49) { - return internal::ValueArray49(v1, v2, v3, v4, v5, v6, - v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, - v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, - v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49); -} - -template -internal::ValueArray50 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, - T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, - T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, - T38 v38, T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, - T46 v46, T47 v47, T48 v48, T49 v49, T50 v50) { - return internal::ValueArray50(v1, v2, v3, v4, - v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, - v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, - v48, v49, v50); -} - -// Bool() allows generating tests with parameters in a set of (false, true). -// -// Synopsis: -// Bool() -// - returns a generator producing sequences with elements {false, true}. 
-// -// It is useful when testing code that depends on Boolean flags. Combinations -// of multiple flags can be tested when several Bool()'s are combined using -// Combine() function. -// -// In the following example all tests in the test case FlagDependentTest -// will be instantiated twice with parameters false and true. -// -// class FlagDependentTest : public testing::TestWithParam { -// virtual void SetUp() { -// external_flag = GetParam(); -// } -// } -// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool()); -// -inline internal::ParamGenerator Bool() { - return Values(false, true); -} - -# if GTEST_HAS_COMBINE -// Combine() allows the user to combine two or more sequences to produce -// values of a Cartesian product of those sequences' elements. -// -// Synopsis: -// Combine(gen1, gen2, ..., genN) -// - returns a generator producing sequences with elements coming from -// the Cartesian product of elements from the sequences generated by -// gen1, gen2, ..., genN. The sequence elements will have a type of -// tuple where T1, T2, ..., TN are the types -// of elements from sequences produces by gen1, gen2, ..., genN. -// -// Combine can have up to 10 arguments. This number is currently limited -// by the maximum number of elements in the tuple implementation used by Google -// Test. -// -// Example: -// -// This will instantiate tests in test case AnimalTest each one with -// the parameter values tuple("cat", BLACK), tuple("cat", WHITE), -// tuple("dog", BLACK), and tuple("dog", WHITE): -// -// enum Color { BLACK, GRAY, WHITE }; -// class AnimalTest -// : public testing::TestWithParam > {...}; -// -// TEST_P(AnimalTest, AnimalLooksNice) {...} -// -// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest, -// Combine(Values("cat", "dog"), -// Values(BLACK, WHITE))); -// -// This will instantiate tests in FlagDependentTest with all variations of two -// Boolean flags: -// -// class FlagDependentTest -// : public testing::TestWithParam > { -// virtual void SetUp() { -// // Assigns external_flag_1 and external_flag_2 values from the tuple. -// tie(external_flag_1, external_flag_2) = GetParam(); -// } -// }; -// -// TEST_P(FlagDependentTest, TestFeature1) { -// // Test your code using external_flag_1 and external_flag_2 here. 
-// } -// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest, -// Combine(Bool(), Bool())); -// -template -internal::CartesianProductHolder2 Combine( - const Generator1& g1, const Generator2& g2) { - return internal::CartesianProductHolder2( - g1, g2); -} - -template -internal::CartesianProductHolder3 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3) { - return internal::CartesianProductHolder3( - g1, g2, g3); -} - -template -internal::CartesianProductHolder4 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4) { - return internal::CartesianProductHolder4( - g1, g2, g3, g4); -} - -template -internal::CartesianProductHolder5 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5) { - return internal::CartesianProductHolder5( - g1, g2, g3, g4, g5); -} - -template -internal::CartesianProductHolder6 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6) { - return internal::CartesianProductHolder6( - g1, g2, g3, g4, g5, g6); -} - -template -internal::CartesianProductHolder7 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7) { - return internal::CartesianProductHolder7( - g1, g2, g3, g4, g5, g6, g7); -} - -template -internal::CartesianProductHolder8 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7, const Generator8& g8) { - return internal::CartesianProductHolder8( - g1, g2, g3, g4, g5, g6, g7, g8); -} - -template -internal::CartesianProductHolder9 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7, const Generator8& g8, const Generator9& g9) { - return internal::CartesianProductHolder9( - g1, g2, g3, g4, g5, g6, g7, g8, g9); -} - -template -internal::CartesianProductHolder10 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7, const Generator8& g8, const Generator9& g9, - const Generator10& g10) { - return internal::CartesianProductHolder10( - g1, g2, g3, g4, g5, g6, g7, g8, g9, g10); -} -# endif // GTEST_HAS_COMBINE - - - -# define TEST_P(test_case_name, test_name) \ - class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ - : public test_case_name { \ - public: \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \ - virtual void TestBody(); \ - private: \ - static int AddToRegistry() { \ - ::testing::UnitTest::GetInstance()->parameterized_test_registry(). 
\
-          GetTestCasePatternHolder<test_case_name>(\
-              #test_case_name, __FILE__, __LINE__)->AddTestPattern(\
-                  #test_case_name, \
-                  #test_name, \
-                  new ::testing::internal::TestMetaFactory< \
-                      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>()); \
-      return 0; \
-    } \
-    static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(\
-        GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \
-  }; \
-  int GTEST_TEST_CLASS_NAME_(test_case_name, \
-                             test_name)::gtest_registering_dummy_ = \
-      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \
-  void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
-
-# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator) \
-  ::testing::internal::ParamGenerator<test_case_name::ParamType> \
-      gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \
-  int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \
-      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
-          GetTestCasePatternHolder<test_case_name>(\
-              #test_case_name, __FILE__, __LINE__)->AddTestCaseInstantiation(\
-                  #prefix, \
-                  &gtest_##prefix##test_case_name##_EvalGenerator_, \
-                  __FILE__, __LINE__)
-
-}  // namespace testing
-
-#endif  // GTEST_HAS_PARAM_TEST
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
-// Copyright 2006, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-//
-// Google C++ Testing Framework definitions useful in production code.
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_
-
-// When you need to test the private or protected members of a class,
-// use the FRIEND_TEST macro to declare your tests as friends of the
-// class. For example:
-//
-// class MyClass {
-//  private:
-//   void MyMethod();
-//   FRIEND_TEST(MyClassTest, MyMethod);
-// };
-//
-// class MyClassTest : public testing::Test {
-//   // ...
-// };
-//
-// TEST_F(MyClassTest, MyMethod) {
-//   // Can call MyClass::MyMethod() here.
-// } - -#define FRIEND_TEST(test_case_name, test_name)\ -friend class test_case_name##_##test_name##_Test - -#endif // GTEST_INCLUDE_GTEST_GTEST_PROD_H_ -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: mheule@google.com (Markus Heule) -// - -#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ -#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ - -#include -#include - -namespace testing { - -// A copyable object representing the result of a test part (i.e. an -// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCESS()). -// -// Don't inherit from TestPartResult as its destructor is not virtual. -class GTEST_API_ TestPartResult { - public: - // The possible outcomes of a test part (i.e. an assertion or an - // explicit SUCCEED(), FAIL(), or ADD_FAILURE()). - enum Type { - kSuccess, // Succeeded. - kNonFatalFailure, // Failed but the test can continue. - kFatalFailure // Failed and the test should be terminated. - }; - - // C'tor. TestPartResult does NOT have a default constructor. - // Always use this constructor (with parameters) to create a - // TestPartResult object. - TestPartResult(Type a_type, - const char* a_file_name, - int a_line_number, - const char* a_message) - : type_(a_type), - file_name_(a_file_name == NULL ? "" : a_file_name), - line_number_(a_line_number), - summary_(ExtractSummary(a_message)), - message_(a_message) { - } - - // Gets the outcome of the test part. - Type type() const { return type_; } - - // Gets the name of the source file where the test part took place, or - // NULL if it's unknown. - const char* file_name() const { - return file_name_.empty() ? NULL : file_name_.c_str(); - } - - // Gets the line in the source file where the test part took place, - // or -1 if it's unknown. - int line_number() const { return line_number_; } - - // Gets the summary of the failure message. - const char* summary() const { return summary_.c_str(); } - - // Gets the message associated with the test part. 
- const char* message() const { return message_.c_str(); } - - // Returns true iff the test part passed. - bool passed() const { return type_ == kSuccess; } - - // Returns true iff the test part failed. - bool failed() const { return type_ != kSuccess; } - - // Returns true iff the test part non-fatally failed. - bool nonfatally_failed() const { return type_ == kNonFatalFailure; } - - // Returns true iff the test part fatally failed. - bool fatally_failed() const { return type_ == kFatalFailure; } - - private: - Type type_; - - // Gets the summary of the failure message by omitting the stack - // trace in it. - static std::string ExtractSummary(const char* message); - - // The name of the source file where the test part took place, or - // "" if the source file is unknown. - std::string file_name_; - // The line in the source file where the test part took place, or -1 - // if the line number is unknown. - int line_number_; - std::string summary_; // The test failure summary. - std::string message_; // The test failure message. -}; - -// Prints a TestPartResult object. -std::ostream& operator<<(std::ostream& os, const TestPartResult& result); - -// An array of TestPartResult objects. -// -// Don't inherit from TestPartResultArray as its destructor is not -// virtual. -class GTEST_API_ TestPartResultArray { - public: - TestPartResultArray() {} - - // Appends the given TestPartResult to the array. - void Append(const TestPartResult& result); - - // Returns the TestPartResult at the given index (0-based). - const TestPartResult& GetTestPartResult(int index) const; - - // Returns the number of TestPartResult objects in the array. - int size() const; - - private: - std::vector array_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray); -}; - -// This interface knows how to report a test part result. -class TestPartResultReporterInterface { - public: - virtual ~TestPartResultReporterInterface() {} - - virtual void ReportTestPartResult(const TestPartResult& result) = 0; -}; - -namespace internal { - -// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a -// statement generates new fatal failures. To do so it registers itself as the -// current test part result reporter. Besides checking if fatal failures were -// reported, it only delegates the reporting to the former result reporter. -// The original result reporter is restored in the destructor. -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -class GTEST_API_ HasNewFatalFailureHelper - : public TestPartResultReporterInterface { - public: - HasNewFatalFailureHelper(); - virtual ~HasNewFatalFailureHelper(); - virtual void ReportTestPartResult(const TestPartResult& result); - bool has_new_fatal_failure() const { return has_new_fatal_failure_; } - private: - bool has_new_fatal_failure_; - TestPartResultReporterInterface* original_reporter_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper); -}; - -} // namespace internal - -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. 
-// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ - -// This header implements typed tests and type-parameterized tests. - -// Typed (aka type-driven) tests repeat the same test for types in a -// list. You must know which types you want to test with when writing -// typed tests. Here's how you do it: - -#if 0 - -// First, define a fixture class template. It should be parameterized -// by a type. Remember to derive it from testing::Test. -template -class FooTest : public testing::Test { - public: - ... - typedef std::list List; - static T shared_; - T value_; -}; - -// Next, associate a list of types with the test case, which will be -// repeated for each type in the list. The typedef is necessary for -// the macro to parse correctly. -typedef testing::Types MyTypes; -TYPED_TEST_CASE(FooTest, MyTypes); - -// If the type list contains only one type, you can write that type -// directly without Types<...>: -// TYPED_TEST_CASE(FooTest, int); - -// Then, use TYPED_TEST() instead of TEST_F() to define as many typed -// tests for this test case as you want. -TYPED_TEST(FooTest, DoesBlah) { - // Inside a test, refer to TypeParam to get the type parameter. - // Since we are inside a derived class template, C++ requires use to - // visit the members of FooTest via 'this'. - TypeParam n = this->value_; - - // To visit static members of the fixture, add the TestFixture:: - // prefix. - n += TestFixture::shared_; - - // To refer to typedefs in the fixture, add the "typename - // TestFixture::" prefix. - typename TestFixture::List values; - values.push_back(n); - ... -} - -TYPED_TEST(FooTest, HasPropertyA) { ... } - -#endif // 0 - -// Type-parameterized tests are abstract test patterns parameterized -// by a type. Compared with typed tests, type-parameterized tests -// allow you to define the test pattern without knowing what the type -// parameters are. The defined pattern can be instantiated with -// different types any number of times, in any number of translation -// units. -// -// If you are designing an interface or concept, you can define a -// suite of type-parameterized tests to verify properties that any -// valid implementation of the interface/concept should have. 
Then, -// each implementation can easily instantiate the test suite to verify -// that it conforms to the requirements, without having to write -// similar tests repeatedly. Here's an example: - -#if 0 - -// First, define a fixture class template. It should be parameterized -// by a type. Remember to derive it from testing::Test. -template -class FooTest : public testing::Test { - ... -}; - -// Next, declare that you will define a type-parameterized test case -// (the _P suffix is for "parameterized" or "pattern", whichever you -// prefer): -TYPED_TEST_CASE_P(FooTest); - -// Then, use TYPED_TEST_P() to define as many type-parameterized tests -// for this type-parameterized test case as you want. -TYPED_TEST_P(FooTest, DoesBlah) { - // Inside a test, refer to TypeParam to get the type parameter. - TypeParam n = 0; - ... -} - -TYPED_TEST_P(FooTest, HasPropertyA) { ... } - -// Now the tricky part: you need to register all test patterns before -// you can instantiate them. The first argument of the macro is the -// test case name; the rest are the names of the tests in this test -// case. -REGISTER_TYPED_TEST_CASE_P(FooTest, - DoesBlah, HasPropertyA); - -// Finally, you are free to instantiate the pattern with the types you -// want. If you put the above code in a header file, you can #include -// it in multiple C++ source files and instantiate it multiple times. -// -// To distinguish different instances of the pattern, the first -// argument to the INSTANTIATE_* macro is a prefix that will be added -// to the actual test case name. Remember to pick unique prefixes for -// different instances. -typedef testing::Types MyTypes; -INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); - -// If the type list contains only one type, you can write that type -// directly without Types<...>: -// INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int); - -#endif // 0 - - -// Implements typed tests. - -#if GTEST_HAS_TYPED_TEST - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Expands to the name of the typedef for the type parameters of the -// given test case. -# define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_ - -// The 'Types' template argument below must have spaces around it -// since some compilers may choke on '>>' when passing a template -// instance (e.g. Types) -# define TYPED_TEST_CASE(CaseName, Types) \ - typedef ::testing::internal::TypeList< Types >::type \ - GTEST_TYPE_PARAMS_(CaseName) - -# define TYPED_TEST(CaseName, TestName) \ - template \ - class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ - : public CaseName { \ - private: \ - typedef CaseName TestFixture; \ - typedef gtest_TypeParam_ TypeParam; \ - virtual void TestBody(); \ - }; \ - bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTest< \ - CaseName, \ - ::testing::internal::TemplateSel< \ - GTEST_TEST_CLASS_NAME_(CaseName, TestName)>, \ - GTEST_TYPE_PARAMS_(CaseName)>::Register(\ - "", #CaseName, #TestName, 0); \ - template \ - void GTEST_TEST_CLASS_NAME_(CaseName, TestName)::TestBody() - -#endif // GTEST_HAS_TYPED_TEST - -// Implements type-parameterized tests. - -#if GTEST_HAS_TYPED_TEST_P - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Expands to the namespace name that the type-parameterized tests for -// the given type-parameterized test case are defined in. The exact -// name of the namespace is subject to change without notice. 
-# define GTEST_CASE_NAMESPACE_(TestCaseName) \ - gtest_case_##TestCaseName##_ - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Expands to the name of the variable used to remember the names of -// the defined tests in the given test case. -# define GTEST_TYPED_TEST_CASE_P_STATE_(TestCaseName) \ - gtest_typed_test_case_p_state_##TestCaseName##_ - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY. -// -// Expands to the name of the variable used to remember the names of -// the registered tests in the given test case. -# define GTEST_REGISTERED_TEST_NAMES_(TestCaseName) \ - gtest_registered_test_names_##TestCaseName##_ - -// The variables defined in the type-parameterized test macros are -// static as typically these macros are used in a .h file that can be -// #included in multiple translation units linked together. -# define TYPED_TEST_CASE_P(CaseName) \ - static ::testing::internal::TypedTestCasePState \ - GTEST_TYPED_TEST_CASE_P_STATE_(CaseName) - -# define TYPED_TEST_P(CaseName, TestName) \ - namespace GTEST_CASE_NAMESPACE_(CaseName) { \ - template \ - class TestName : public CaseName { \ - private: \ - typedef CaseName TestFixture; \ - typedef gtest_TypeParam_ TypeParam; \ - virtual void TestBody(); \ - }; \ - static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \ - GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\ - __FILE__, __LINE__, #CaseName, #TestName); \ - } \ - template \ - void GTEST_CASE_NAMESPACE_(CaseName)::TestName::TestBody() - -# define REGISTER_TYPED_TEST_CASE_P(CaseName, ...) \ - namespace GTEST_CASE_NAMESPACE_(CaseName) { \ - typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \ - } \ - static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) = \ - GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames(\ - __FILE__, __LINE__, #__VA_ARGS__) - -// The 'Types' template argument below must have spaces around it -// since some compilers may choke on '>>' when passing a template -// instance (e.g. Types) -# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types) \ - bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTestCase::type>::Register(\ - #Prefix, #CaseName, GTEST_REGISTERED_TEST_NAMES_(CaseName)) - -#endif // GTEST_HAS_TYPED_TEST_P - -#endif // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-string.h" +#include "gtest/gtest-death-test.h" +#include "gtest/gtest-message.h" +#include "gtest/gtest-param-test.h" +#include "gtest/gtest-printers.h" +#include "gtest/gtest_prod.h" +#include "gtest/gtest-test-part.h" +#include "gtest/gtest-typed-test.h" // Depending on the platform, different string classes are available. // On Linux, in addition to ::std::string, Google also makes use of // class ::string, which has the same interface as ::std::string, but // has a different implementation. // -// The user can define GTEST_HAS_GLOBAL_STRING to 1 to indicate that +// You can define GTEST_HAS_GLOBAL_STRING to 1 to indicate that // ::string is available AND is a distinct type to ::std::string, or // define it to 0 to indicate otherwise. // -// If the user's ::std::string and ::string are the same class due to -// aliasing, he should define GTEST_HAS_GLOBAL_STRING to 0. +// If ::std::string and ::string are the same class on your platform +// due to aliasing, you should define GTEST_HAS_GLOBAL_STRING to 0. 
// -// If the user doesn't define GTEST_HAS_GLOBAL_STRING, it is defined +// If you do not define GTEST_HAS_GLOBAL_STRING, it is defined // heuristically. namespace testing { @@ -17671,8 +258,31 @@ class GTEST_API_ AssertionResult { // Copy constructor. // Used in EXPECT_TRUE/FALSE(assertion_result). AssertionResult(const AssertionResult& other); + + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */) + // Used in the EXPECT_TRUE/FALSE(bool_expression). - explicit AssertionResult(bool success) : success_(success) {} + // + // T must be contextually convertible to bool. + // + // The second parameter prevents this overload from being considered if + // the argument is implicitly convertible to AssertionResult. In that case + // we want AssertionResult's copy constructor to be used. + template + explicit AssertionResult( + const T& success, + typename internal::EnableIf< + !internal::ImplicitlyConvertible::value>::type* + /*enabler*/ = NULL) + : success_(success) {} + + GTEST_DISABLE_MSC_WARNINGS_POP_() + + // Assignment operator. + AssertionResult& operator=(AssertionResult other) { + swap(other); + return *this; + } // Returns true iff the assertion succeeded. operator bool() const { return success_; } // NOLINT @@ -17713,6 +323,9 @@ class GTEST_API_ AssertionResult { message_->append(a_message.GetString().c_str()); } + // Swap the contents of this AssertionResult with other. + void swap(AssertionResult& other); + // Stores result of the assertion predicate. bool success_; // Stores the message describing the condition in case the expectation @@ -17720,8 +333,6 @@ class GTEST_API_ AssertionResult { // Referenced via a pointer to avoid taking too much stack frame space // with test assertions. internal::scoped_ptr< ::std::string> message_; - - GTEST_DISALLOW_ASSIGN_(AssertionResult); }; // Makes a successful assertion result. @@ -17748,8 +359,8 @@ GTEST_API_ AssertionResult AssertionFailure(const Message& msg); // // class FooTest : public testing::Test { // protected: -// virtual void SetUp() { ... } -// virtual void TearDown() { ... } +// void SetUp() override { ... } +// void TearDown() override { ... } // ... // }; // @@ -17841,20 +452,19 @@ class GTEST_API_ Test { // internal method to avoid clashing with names used in user TESTs. void DeleteSelf_() { delete this; } - // Uses a GTestFlagSaver to save and restore all Google Test flags. - const internal::GTestFlagSaver* const gtest_flag_saver_; + const internal::scoped_ptr< GTEST_FLAG_SAVER_ > gtest_flag_saver_; - // Often a user mis-spells SetUp() as Setup() and spends a long time + // Often a user misspells SetUp() as Setup() and spends a long time // wondering why it is never called by Google Test. The declaration of // the following method is solely for catching such an error at // compile time: // // - The return type is deliberately chosen to be not void, so it - // will be a conflict if a user declares void Setup() in his test - // fixture. + // will be a conflict if void Setup() is declared in the user's + // test fixture. // // - This method is private, so it will be another compiler error - // if a user calls it from his test fixture. + // if the method is called from the user's test fixture. // // DO NOT OVERRIDE THIS FUNCTION. // @@ -18059,6 +669,12 @@ class GTEST_API_ TestInfo { return NULL; } + // Returns the file name where this test is defined. + const char* file() const { return location_.file.c_str(); } + + // Returns the line where this test is defined. 
+ int line() const { return location_.line; } + // Returns true if this test should run, that is if the test is not // disabled (or it is disabled but the also_run_disabled_tests flag has // been specified) and its full name matches the user-specified filter. @@ -18101,6 +717,7 @@ class GTEST_API_ TestInfo { const char* name, const char* type_param, const char* value_param, + internal::CodeLocation code_location, internal::TypeId fixture_class_id, Test::SetUpTestCaseFunc set_up_tc, Test::TearDownTestCaseFunc tear_down_tc, @@ -18112,6 +729,7 @@ class GTEST_API_ TestInfo { const std::string& name, const char* a_type_param, // NULL if not a type-parameterized test const char* a_value_param, // NULL if not a value-parameterized test + internal::CodeLocation a_code_location, internal::TypeId fixture_class_id, internal::TestFactoryBase* factory); @@ -18138,6 +756,7 @@ class GTEST_API_ TestInfo { // Text representation of the value parameter, or NULL if this is not a // value-parameterized test. const internal::scoped_ptr value_param_; + internal::CodeLocation location_; const internal::TypeId fixture_class_id_; // ID of the test fixture class bool should_run_; // True iff this test should run bool is_disabled_; // True iff this test is disabled @@ -18337,7 +956,7 @@ class GTEST_API_ TestCase { }; // An Environment object is capable of setting up and tearing down an -// environment. The user should subclass this to define his own +// environment. You should subclass this to define your own // environment(s). // // An Environment object does the set-up and tear-down in virtual @@ -18749,137 +1368,42 @@ GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv); namespace internal { -// FormatForComparison::Format(value) formats a -// value of type ToPrint that is an operand of a comparison assertion -// (e.g. ASSERT_EQ). OtherOperand is the type of the other operand in -// the comparison, and is used to help determine the best way to -// format the value. In particular, when the value is a C string -// (char pointer) and the other operand is an STL string object, we -// want to format the C string as a string, since we know it is -// compared by value with the string object. If the value is a char -// pointer but the other operand is not an STL string object, we don't -// know whether the pointer is supposed to point to a NUL-terminated -// string, and thus want to print it as a pointer to be safe. -// -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. - -// The default case. -template -class FormatForComparison { - public: - static ::std::string Format(const ToPrint& value) { - return ::testing::PrintToString(value); - } -}; - -// Array. -template -class FormatForComparison { - public: - static ::std::string Format(const ToPrint* value) { - return FormatForComparison::Format(value); - } -}; - -// By default, print C string as pointers to be safe, as we don't know -// whether they actually point to a NUL-terminated string. 
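For illustration only (this snippet is not part of the patch): the removed comment block above explains how FormatForComparison chooses a formatting for a char* operand based on the type of the other operand. A minimal sketch of the observable behaviour, assuming a test binary linked against gtest_main; the test and variable names are illustrative.

#include <string>
#include "gtest/gtest.h"

TEST(FormatForComparisonSketch, PointerVersusStringOperand) {
  const char* greeting = "hello";
  std::string expected = "hello";

  // The other operand is a std::string, so on a mismatch Google Test would
  // format the char* by content (as a quoted string) in the failure message.
  EXPECT_EQ(expected, greeting);

  // The other operand is another char*, so on a mismatch both sides would be
  // formatted as raw pointer values, since Google Test cannot know whether
  // they point to NUL-terminated strings.
  const char* alias = greeting;
  EXPECT_EQ(alias, greeting);
}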
- -#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType) \ - template \ - class FormatForComparison { \ - public: \ - static ::std::string Format(CharType* value) { \ - return ::testing::PrintToString(static_cast(value)); \ - } \ - } - -GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char); -GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char); -GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t); -GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t); - -#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_ - -// If a C string is compared with an STL string object, we know it's meant -// to point to a NUL-terminated string, and thus can print it as a string. - -#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \ - template <> \ - class FormatForComparison { \ - public: \ - static ::std::string Format(CharType* value) { \ - return ::testing::PrintToString(value); \ - } \ - } - -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string); -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string); - -#if GTEST_HAS_GLOBAL_STRING -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::string); -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::string); -#endif - -#if GTEST_HAS_GLOBAL_WSTRING -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::wstring); -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::wstring); -#endif - -#if GTEST_HAS_STD_WSTRING -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring); -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring); -#endif - -#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_ - -// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, and etc) -// operand to be used in a failure message. The type (but not value) -// of the other operand may affect the format. This allows us to -// print a char* as a raw pointer when it is compared against another -// char* or void*, and print it as a C string when it is compared -// against an std::string object, for example. -// -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +// Separate the error generating code from the code path to reduce the stack +// frame size of CmpHelperEQ. This helps reduce the overhead of some sanitizers +// when calling EXPECT_* in a tight loop. template -std::string FormatForComparisonFailureMessage( - const T1& value, const T2& /* other_operand */) { - return FormatForComparison::Format(value); +AssertionResult CmpHelperEQFailure(const char* lhs_expression, + const char* rhs_expression, + const T1& lhs, const T2& rhs) { + return EqFailure(lhs_expression, + rhs_expression, + FormatForComparisonFailureMessage(lhs, rhs), + FormatForComparisonFailureMessage(rhs, lhs), + false); } // The helper function for {ASSERT|EXPECT}_EQ. template -AssertionResult CmpHelperEQ(const char* expected_expression, - const char* actual_expression, - const T1& expected, - const T2& actual) { -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4389) // Temporarily disables warning on - // signed/unsigned mismatch. -#endif - - if (expected == actual) { +AssertionResult CmpHelperEQ(const char* lhs_expression, + const char* rhs_expression, + const T1& lhs, + const T2& rhs) { +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4389 /* signed/unsigned mismatch */) + if (lhs == rhs) { return AssertionSuccess(); } +GTEST_DISABLE_MSC_WARNINGS_POP_() -#ifdef _MSC_VER -# pragma warning(pop) // Restores the warning state. 
-#endif - - return EqFailure(expected_expression, - actual_expression, - FormatForComparisonFailureMessage(expected, actual), - FormatForComparisonFailureMessage(actual, expected), - false); + return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs); } // With this overloaded version, we allow anonymous enums to be used // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums // can be implicitly cast to BiggestInt. -GTEST_API_ AssertionResult CmpHelperEQ(const char* expected_expression, - const char* actual_expression, - BiggestInt expected, - BiggestInt actual); +GTEST_API_ AssertionResult CmpHelperEQ(const char* lhs_expression, + const char* rhs_expression, + BiggestInt lhs, + BiggestInt rhs); // The helper class for {ASSERT|EXPECT}_EQ. The template argument // lhs_is_null_literal is true iff the first argument to ASSERT_EQ() @@ -18890,12 +1414,11 @@ class EqHelper { public: // This templatized version is for the general case. template - static AssertionResult Compare(const char* expected_expression, - const char* actual_expression, - const T1& expected, - const T2& actual) { - return CmpHelperEQ(expected_expression, actual_expression, expected, - actual); + static AssertionResult Compare(const char* lhs_expression, + const char* rhs_expression, + const T1& lhs, + const T2& rhs) { + return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs); } // With this overloaded version, we allow anonymous enums to be used @@ -18904,12 +1427,11 @@ class EqHelper { // // Even though its body looks the same as the above version, we // cannot merge the two, as it will make anonymous enums unhappy. - static AssertionResult Compare(const char* expected_expression, - const char* actual_expression, - BiggestInt expected, - BiggestInt actual) { - return CmpHelperEQ(expected_expression, actual_expression, expected, - actual); + static AssertionResult Compare(const char* lhs_expression, + const char* rhs_expression, + BiggestInt lhs, + BiggestInt rhs) { + return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs); } }; @@ -18924,40 +1446,52 @@ class EqHelper { // EXPECT_EQ(false, a_bool). template static AssertionResult Compare( - const char* expected_expression, - const char* actual_expression, - const T1& expected, - const T2& actual, + const char* lhs_expression, + const char* rhs_expression, + const T1& lhs, + const T2& rhs, // The following line prevents this overload from being considered if T2 // is not a pointer type. We need this because ASSERT_EQ(NULL, my_ptr) // expands to Compare("", "", NULL, my_ptr), which requires a conversion // to match the Secret* in the other overload, which would otherwise make // this template match better. typename EnableIf::value>::type* = 0) { - return CmpHelperEQ(expected_expression, actual_expression, expected, - actual); + return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs); } // This version will be picked when the second argument to ASSERT_EQ() is a // pointer, e.g. ASSERT_EQ(NULL, a_pointer). template static AssertionResult Compare( - const char* expected_expression, - const char* actual_expression, + const char* lhs_expression, + const char* rhs_expression, // We used to have a second template parameter instead of Secret*. That // template parameter would deduce to 'long', making this a better match // than the first overload even without the first overload's EnableIf. 
// Unfortunately, gcc with -Wconversion-null warns when "passing NULL to // non-pointer argument" (even a deduced integral argument), so the old // implementation caused warnings in user code. - Secret* /* expected (NULL) */, - T* actual) { - // We already know that 'expected' is a null pointer. - return CmpHelperEQ(expected_expression, actual_expression, - static_cast(NULL), actual); + Secret* /* lhs (NULL) */, + T* rhs) { + // We already know that 'lhs' is a null pointer. + return CmpHelperEQ(lhs_expression, rhs_expression, + static_cast(NULL), rhs); } }; +// Separate the error generating code from the code path to reduce the stack +// frame size of CmpHelperOP. This helps reduce the overhead of some sanitizers +// when calling EXPECT_OP in a tight loop. +template +AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2, + const T1& val1, const T2& val2, + const char* op) { + return AssertionFailure() + << "Expected: (" << expr1 << ") " << op << " (" << expr2 + << "), actual: " << FormatForComparisonFailureMessage(val1, val2) + << " vs " << FormatForComparisonFailureMessage(val2, val1); +} + // A macro for implementing the helper functions needed to implement // ASSERT_?? and EXPECT_??. It is here just to avoid copy-and-paste // of similar code. @@ -18968,6 +1502,7 @@ class EqHelper { // with gcc 4. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + #define GTEST_IMPL_CMP_HELPER_(op_name, op)\ template \ AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ @@ -18975,10 +1510,7 @@ AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ if (val1 op val2) {\ return AssertionSuccess();\ } else {\ - return AssertionFailure() \ - << "Expected: (" << expr1 << ") " #op " (" << expr2\ - << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\ - << " vs " << FormatForComparisonFailureMessage(val2, val1);\ + return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\ }\ }\ GTEST_API_ AssertionResult CmpHelper##op_name(\ @@ -19002,18 +1534,18 @@ GTEST_IMPL_CMP_HELPER_(GT, >); // The helper function for {ASSERT|EXPECT}_STREQ. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression, - const char* actual_expression, - const char* expected, - const char* actual); +GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2); // The helper function for {ASSERT|EXPECT}_STRCASEEQ. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression, - const char* actual_expression, - const char* expected, - const char* actual); +GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2); // The helper function for {ASSERT|EXPECT}_STRNE. // @@ -19035,10 +1567,10 @@ GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression, // Helper function for *_STREQ on wide strings. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression, - const char* actual_expression, - const wchar_t* expected, - const wchar_t* actual); +GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression, + const char* s2_expression, + const wchar_t* s1, + const wchar_t* s2); // Helper function for *_STRNE on wide strings. 
// @@ -19096,28 +1628,28 @@ namespace internal { // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. template -AssertionResult CmpHelperFloatingPointEQ(const char* expected_expression, - const char* actual_expression, - RawType expected, - RawType actual) { - const FloatingPoint lhs(expected), rhs(actual); +AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression, + const char* rhs_expression, + RawType lhs_value, + RawType rhs_value) { + const FloatingPoint lhs(lhs_value), rhs(rhs_value); if (lhs.AlmostEquals(rhs)) { return AssertionSuccess(); } - ::std::stringstream expected_ss; - expected_ss << std::setprecision(std::numeric_limits::digits10 + 2) - << expected; + ::std::stringstream lhs_ss; + lhs_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << lhs_value; - ::std::stringstream actual_ss; - actual_ss << std::setprecision(std::numeric_limits::digits10 + 2) - << actual; + ::std::stringstream rhs_ss; + rhs_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << rhs_value; - return EqFailure(expected_expression, - actual_expression, - StringStreamToString(&expected_ss), - StringStreamToString(&actual_ss), + return EqFailure(lhs_expression, + rhs_expression, + StringStreamToString(&lhs_ss), + StringStreamToString(&rhs_ss), false); } @@ -19325,13 +1857,13 @@ class TestWithParam : public Test, public WithParamInterface { // AssertionResult. For more information on how to use AssertionResult with // these macros see comments on that class. #define EXPECT_TRUE(condition) \ - GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ + GTEST_TEST_BOOLEAN_((condition), #condition, false, true, \ GTEST_NONFATAL_FAILURE_) #define EXPECT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ GTEST_NONFATAL_FAILURE_) #define ASSERT_TRUE(condition) \ - GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ + GTEST_TEST_BOOLEAN_((condition), #condition, false, true, \ GTEST_FATAL_FAILURE_) #define ASSERT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ @@ -19339,373 +1871,16 @@ class TestWithParam : public Test, public WithParamInterface { // Includes the auto-generated header that implements a family of // generic predicate assertion macros. -// Copyright 2006, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// This file is AUTOMATICALLY GENERATED on 10/31/2011 by command -// 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND! -// -// Implements a family of generic predicate assertion macros. - -#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ - -// Makes sure this header is not included before gtest.h. -#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ -# error Do not include gtest_pred_impl.h directly. Include gtest.h instead. -#endif // GTEST_INCLUDE_GTEST_GTEST_H_ - -// This header implements a family of generic predicate assertion -// macros: -// -// ASSERT_PRED_FORMAT1(pred_format, v1) -// ASSERT_PRED_FORMAT2(pred_format, v1, v2) -// ... -// -// where pred_format is a function or functor that takes n (in the -// case of ASSERT_PRED_FORMATn) values and their source expression -// text, and returns a testing::AssertionResult. See the definition -// of ASSERT_EQ in gtest.h for an example. -// -// If you don't care about formatting, you can use the more -// restrictive version: -// -// ASSERT_PRED1(pred, v1) -// ASSERT_PRED2(pred, v1, v2) -// ... -// -// where pred is an n-ary function or functor that returns bool, -// and the values v1, v2, ..., must support the << operator for -// streaming to std::ostream. -// -// We also define the EXPECT_* variations. -// -// For now we only support predicates whose arity is at most 5. -// Please email googletestframework@googlegroups.com if you need -// support for higher arities. - -// GTEST_ASSERT_ is the basic statement to which all of the assertions -// in this file reduce. Don't use this in your code. - -#define GTEST_ASSERT_(expression, on_failure) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (const ::testing::AssertionResult gtest_ar = (expression)) \ - ; \ - else \ - on_failure(gtest_ar.failure_message()) - - -// Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use -// this in your code. -template -AssertionResult AssertPred1Helper(const char* pred_text, - const char* e1, - Pred pred, - const T1& v1) { - if (pred(v1)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1. -// Don't use this in your code. -#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, v1), \ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use -// this in your code. -#define GTEST_PRED1_(pred, v1, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \ - #v1, \ - pred, \ - v1), on_failure) - -// Unary predicate assertion macros. 
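For illustration only (not part of the patch): the comments above describe the two flavours of predicate assertions. *_PREDn takes a bool-returning predicate, while *_PRED_FORMATn takes a predicate-formatter that also receives the source expression text and returns testing::AssertionResult. A minimal sketch, assuming gtest_main; IsEven and IsEvenFormat are illustrative names.

#include "gtest/gtest.h"

// Plain predicate: returns bool, usable with EXPECT_PRED1/ASSERT_PRED1.
bool IsEven(int n) { return (n % 2) == 0; }

// Predicate-formatter: also receives the expression text and builds its own
// failure message, usable with EXPECT_PRED_FORMAT1/ASSERT_PRED_FORMAT1.
::testing::AssertionResult IsEvenFormat(const char* expr, int n) {
  if ((n % 2) == 0) return ::testing::AssertionSuccess();
  return ::testing::AssertionFailure() << expr << " = " << n << ", which is odd";
}

TEST(PredicateAssertionSketch, UnaryForms) {
  EXPECT_PRED1(IsEven, 4);                // a failure would print the expression and value
  EXPECT_PRED_FORMAT1(IsEvenFormat, 10);  // a failure would use the custom message
}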
-#define EXPECT_PRED_FORMAT1(pred_format, v1) \ - GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED1(pred, v1) \ - GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT1(pred_format, v1) \ - GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED1(pred, v1) \ - GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_) - - - -// Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use -// this in your code. -template -AssertionResult AssertPred2Helper(const char* pred_text, - const char* e1, - const char* e2, - Pred pred, - const T1& v1, - const T2& v2) { - if (pred(v1, v2)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2. -// Don't use this in your code. -#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use -// this in your code. -#define GTEST_PRED2_(pred, v1, v2, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \ - #v1, \ - #v2, \ - pred, \ - v1, \ - v2), on_failure) - -// Binary predicate assertion macros. -#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \ - GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED2(pred, v1, v2) \ - GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \ - GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED2(pred, v1, v2) \ - GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_) - - - -// Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use -// this in your code. -template -AssertionResult AssertPred3Helper(const char* pred_text, - const char* e1, - const char* e2, - const char* e3, - Pred pred, - const T1& v1, - const T2& v2, - const T3& v3) { - if (pred(v1, v2, v3)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ", " - << e3 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2 - << "\n" << e3 << " evaluates to " << v3; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3. -// Don't use this in your code. -#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use -// this in your code. -#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - pred, \ - v1, \ - v2, \ - v3), on_failure) - -// Ternary predicate assertion macros. -#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \ - GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED3(pred, v1, v2, v3) \ - GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \ - GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED3(pred, v1, v2, v3) \ - GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_) - - - -// Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use -// this in your code. 
-template -AssertionResult AssertPred4Helper(const char* pred_text, - const char* e1, - const char* e2, - const char* e3, - const char* e4, - Pred pred, - const T1& v1, - const T2& v2, - const T3& v3, - const T4& v4) { - if (pred(v1, v2, v3, v4)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ", " - << e3 << ", " - << e4 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2 - << "\n" << e3 << " evaluates to " << v3 - << "\n" << e4 << " evaluates to " << v4; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4. -// Don't use this in your code. -#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use -// this in your code. -#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - #v4, \ - pred, \ - v1, \ - v2, \ - v3, \ - v4), on_failure) - -// 4-ary predicate assertion macros. -#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ - GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED4(pred, v1, v2, v3, v4) \ - GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ - GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED4(pred, v1, v2, v3, v4) \ - GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) - - - -// Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use -// this in your code. -template -AssertionResult AssertPred5Helper(const char* pred_text, - const char* e1, - const char* e2, - const char* e3, - const char* e4, - const char* e5, - Pred pred, - const T1& v1, - const T2& v2, - const T3& v3, - const T4& v4, - const T5& v5) { - if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ", " - << e3 << ", " - << e4 << ", " - << e5 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2 - << "\n" << e3 << " evaluates to " << v3 - << "\n" << e4 << " evaluates to " << v4 - << "\n" << e5 << " evaluates to " << v5; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5. -// Don't use this in your code. -#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use -// this in your code. -#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - #v4, \ - #v5, \ - pred, \ - v1, \ - v2, \ - v3, \ - v4, \ - v5), on_failure) - -// 5-ary predicate assertion macros. 
-#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ - GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \ - GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ - GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \ - GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) - - - -#endif // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ +#include "gtest/gtest_pred_impl.h" // Macros for testing equalities and inequalities. // -// * {ASSERT|EXPECT}_EQ(expected, actual): Tests that expected == actual -// * {ASSERT|EXPECT}_NE(v1, v2): Tests that v1 != v2 -// * {ASSERT|EXPECT}_LT(v1, v2): Tests that v1 < v2 -// * {ASSERT|EXPECT}_LE(v1, v2): Tests that v1 <= v2 -// * {ASSERT|EXPECT}_GT(v1, v2): Tests that v1 > v2 -// * {ASSERT|EXPECT}_GE(v1, v2): Tests that v1 >= v2 +// * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2 +// * {ASSERT|EXPECT}_NE(v1, v2): Tests that v1 != v2 +// * {ASSERT|EXPECT}_LT(v1, v2): Tests that v1 < v2 +// * {ASSERT|EXPECT}_LE(v1, v2): Tests that v1 <= v2 +// * {ASSERT|EXPECT}_GT(v1, v2): Tests that v1 > v2 +// * {ASSERT|EXPECT}_GE(v1, v2): Tests that v1 >= v2 // // When they are not, Google Test prints both the tested expressions and // their actual values. The values must be compatible built-in types, @@ -19727,8 +1902,8 @@ AssertionResult AssertPred5Helper(const char* pred_text, // are related, not how their content is related. To compare two C // strings by content, use {ASSERT|EXPECT}_STR*(). // -// 3. {ASSERT|EXPECT}_EQ(expected, actual) is preferred to -// {ASSERT|EXPECT}_TRUE(expected == actual), as the former tells you +// 3. {ASSERT|EXPECT}_EQ(v1, v2) is preferred to +// {ASSERT|EXPECT}_TRUE(v1 == v2), as the former tells you // what the actual value is when it fails, and similarly for the // other comparisons. // @@ -19744,12 +1919,12 @@ AssertionResult AssertPred5Helper(const char* pred_text, // ASSERT_LT(i, array_size); // ASSERT_GT(records.size(), 0) << "There is no record left."; -#define EXPECT_EQ(expected, actual) \ +#define EXPECT_EQ(val1, val2) \ EXPECT_PRED_FORMAT2(::testing::internal:: \ - EqHelper::Compare, \ - expected, actual) -#define EXPECT_NE(expected, actual) \ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, expected, actual) + EqHelper::Compare, \ + val1, val2) +#define EXPECT_NE(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2) #define EXPECT_LE(val1, val2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2) #define EXPECT_LT(val1, val2) \ @@ -19759,10 +1934,10 @@ AssertionResult AssertPred5Helper(const char* pred_text, #define EXPECT_GT(val1, val2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2) -#define GTEST_ASSERT_EQ(expected, actual) \ +#define GTEST_ASSERT_EQ(val1, val2) \ ASSERT_PRED_FORMAT2(::testing::internal:: \ - EqHelper::Compare, \ - expected, actual) + EqHelper::Compare, \ + val1, val2) #define GTEST_ASSERT_NE(val1, val2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2) #define GTEST_ASSERT_LE(val1, val2) \ @@ -19817,29 +1992,29 @@ AssertionResult AssertPred5Helper(const char* pred_text, // // These macros evaluate their arguments exactly once. 
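For illustration only (not part of the patch): the comment block above notes that EXPECT_EQ on two C string pointers compares addresses, not contents, while the *_STR* macros defined next compare contents. A minimal sketch, assuming gtest_main; names are illustrative.

#include <cstring>
#include "gtest/gtest.h"

TEST(CStringAssertionSketch, ContentVersusPointerComparison) {
  char buffer[6];
  std::strcpy(buffer, "hello");
  const char* literal = "hello";

  // EXPECT_EQ(literal, buffer) would compare the two pointer values and
  // usually fail, because buffer and literal are distinct addresses.

  // EXPECT_STREQ compares the NUL-terminated contents, so this passes.
  EXPECT_STREQ(literal, buffer);

  // EXPECT_STRCASEEQ ignores case when comparing contents.
  EXPECT_STRCASEEQ("HELLO", buffer);
}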
-#define EXPECT_STREQ(expected, actual) \ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual) +#define EXPECT_STREQ(s1, s2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2) #define EXPECT_STRNE(s1, s2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) -#define EXPECT_STRCASEEQ(expected, actual) \ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual) +#define EXPECT_STRCASEEQ(s1, s2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2) #define EXPECT_STRCASENE(s1, s2)\ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) -#define ASSERT_STREQ(expected, actual) \ - ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual) +#define ASSERT_STREQ(s1, s2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2) #define ASSERT_STRNE(s1, s2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) -#define ASSERT_STRCASEEQ(expected, actual) \ - ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual) +#define ASSERT_STRCASEEQ(s1, s2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2) #define ASSERT_STRCASENE(s1, s2)\ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) // Macros for comparing floating-point numbers. // -// * {ASSERT|EXPECT}_FLOAT_EQ(expected, actual): +// * {ASSERT|EXPECT}_FLOAT_EQ(val1, val2): // Tests that two float values are almost equal. -// * {ASSERT|EXPECT}_DOUBLE_EQ(expected, actual): +// * {ASSERT|EXPECT}_DOUBLE_EQ(val1, val2): // Tests that two double values are almost equal. // * {ASSERT|EXPECT}_NEAR(v1, v2, abs_error): // Tests that v1 and v2 are within the given distance to each other. @@ -19849,21 +2024,21 @@ AssertionResult AssertPred5Helper(const char* pred_text, // FloatingPoint template class in gtest-internal.h if you are // interested in the implementation details. -#define EXPECT_FLOAT_EQ(expected, actual)\ +#define EXPECT_FLOAT_EQ(val1, val2)\ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ - expected, actual) + val1, val2) -#define EXPECT_DOUBLE_EQ(expected, actual)\ +#define EXPECT_DOUBLE_EQ(val1, val2)\ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ - expected, actual) + val1, val2) -#define ASSERT_FLOAT_EQ(expected, actual)\ +#define ASSERT_FLOAT_EQ(val1, val2)\ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ - expected, actual) + val1, val2) -#define ASSERT_DOUBLE_EQ(expected, actual)\ +#define ASSERT_DOUBLE_EQ(val1, val2)\ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ - expected, actual) + val1, val2) #define EXPECT_NEAR(val1, val2, abs_error)\ EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \ @@ -19985,8 +2160,8 @@ bool StaticAssertTypeEq() { // The convention is to end the test case name with "Test". For // example, a test case for the Foo class can be named FooTest. // -// The user should put his test code between braces after using this -// macro. Example: +// Test code should appear between braces after an invocation of +// this macro. 
Example: // // TEST(FooTest, InitializesCorrectly) { // Foo foo; diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h new file mode 100644 index 0000000000..30ae712f50 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h @@ -0,0 +1,358 @@ +// Copyright 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is AUTOMATICALLY GENERATED on 10/31/2011 by command +// 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND! +// +// Implements a family of generic predicate assertion macros. + +#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ + +// Makes sure this header is not included before gtest.h. +#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ +# error Do not include gtest_pred_impl.h directly. Include gtest.h instead. +#endif // GTEST_INCLUDE_GTEST_GTEST_H_ + +// This header implements a family of generic predicate assertion +// macros: +// +// ASSERT_PRED_FORMAT1(pred_format, v1) +// ASSERT_PRED_FORMAT2(pred_format, v1, v2) +// ... +// +// where pred_format is a function or functor that takes n (in the +// case of ASSERT_PRED_FORMATn) values and their source expression +// text, and returns a testing::AssertionResult. See the definition +// of ASSERT_EQ in gtest.h for an example. +// +// If you don't care about formatting, you can use the more +// restrictive version: +// +// ASSERT_PRED1(pred, v1) +// ASSERT_PRED2(pred, v1, v2) +// ... +// +// where pred is an n-ary function or functor that returns bool, +// and the values v1, v2, ..., must support the << operator for +// streaming to std::ostream. +// +// We also define the EXPECT_* variations. +// +// For now we only support predicates whose arity is at most 5. +// Please email googletestframework@googlegroups.com if you need +// support for higher arities. + +// GTEST_ASSERT_ is the basic statement to which all of the assertions +// in this file reduce. 
Don't use this in your code. + +#define GTEST_ASSERT_(expression, on_failure) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (const ::testing::AssertionResult gtest_ar = (expression)) \ + ; \ + else \ + on_failure(gtest_ar.failure_message()) + + +// Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use +// this in your code. +template +AssertionResult AssertPred1Helper(const char* pred_text, + const char* e1, + Pred pred, + const T1& v1) { + if (pred(v1)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1. +// Don't use this in your code. +#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, v1), \ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use +// this in your code. +#define GTEST_PRED1_(pred, v1, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \ + #v1, \ + pred, \ + v1), on_failure) + +// Unary predicate assertion macros. +#define EXPECT_PRED_FORMAT1(pred_format, v1) \ + GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED1(pred, v1) \ + GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT1(pred_format, v1) \ + GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED1(pred, v1) \ + GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_) + + + +// Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use +// this in your code. +template +AssertionResult AssertPred2Helper(const char* pred_text, + const char* e1, + const char* e2, + Pred pred, + const T1& v1, + const T2& v2) { + if (pred(v1, v2)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ", " + << e2 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1 + << "\n" << e2 << " evaluates to " << v2; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2. +// Don't use this in your code. +#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use +// this in your code. +#define GTEST_PRED2_(pred, v1, v2, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \ + #v1, \ + #v2, \ + pred, \ + v1, \ + v2), on_failure) + +// Binary predicate assertion macros. +#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \ + GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED2(pred, v1, v2) \ + GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \ + GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED2(pred, v1, v2) \ + GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_) + + + +// Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use +// this in your code. 
+template +AssertionResult AssertPred3Helper(const char* pred_text, + const char* e1, + const char* e2, + const char* e3, + Pred pred, + const T1& v1, + const T2& v2, + const T3& v3) { + if (pred(v1, v2, v3)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ", " + << e2 << ", " + << e3 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1 + << "\n" << e2 << " evaluates to " << v2 + << "\n" << e3 << " evaluates to " << v3; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3. +// Don't use this in your code. +#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use +// this in your code. +#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \ + #v1, \ + #v2, \ + #v3, \ + pred, \ + v1, \ + v2, \ + v3), on_failure) + +// Ternary predicate assertion macros. +#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \ + GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED3(pred, v1, v2, v3) \ + GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \ + GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED3(pred, v1, v2, v3) \ + GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_) + + + +// Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use +// this in your code. +template +AssertionResult AssertPred4Helper(const char* pred_text, + const char* e1, + const char* e2, + const char* e3, + const char* e4, + Pred pred, + const T1& v1, + const T2& v2, + const T3& v3, + const T4& v4) { + if (pred(v1, v2, v3, v4)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ", " + << e2 << ", " + << e3 << ", " + << e4 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1 + << "\n" << e2 << " evaluates to " << v2 + << "\n" << e3 << " evaluates to " << v3 + << "\n" << e4 << " evaluates to " << v4; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4. +// Don't use this in your code. +#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use +// this in your code. +#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \ + #v1, \ + #v2, \ + #v3, \ + #v4, \ + pred, \ + v1, \ + v2, \ + v3, \ + v4), on_failure) + +// 4-ary predicate assertion macros. +#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ + GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED4(pred, v1, v2, v3, v4) \ + GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ + GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED4(pred, v1, v2, v3, v4) \ + GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) + + + +// Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use +// this in your code. 
+template +AssertionResult AssertPred5Helper(const char* pred_text, + const char* e1, + const char* e2, + const char* e3, + const char* e4, + const char* e5, + Pred pred, + const T1& v1, + const T2& v2, + const T3& v3, + const T4& v4, + const T5& v5) { + if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ", " + << e2 << ", " + << e3 << ", " + << e4 << ", " + << e5 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1 + << "\n" << e2 << " evaluates to " << v2 + << "\n" << e3 << " evaluates to " << v3 + << "\n" << e4 << " evaluates to " << v4 + << "\n" << e5 << " evaluates to " << v5; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5. +// Don't use this in your code. +#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use +// this in your code. +#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \ + #v1, \ + #v2, \ + #v3, \ + #v4, \ + #v5, \ + pred, \ + v1, \ + v2, \ + v3, \ + v4, \ + v5), on_failure) + +// 5-ary predicate assertion macros. +#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ + GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \ + GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ + GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \ + GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) + + + +#endif // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h new file mode 100644 index 0000000000..da80ddc6c7 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h @@ -0,0 +1,58 @@ +// Copyright 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// Google C++ Testing Framework definitions useful in production code. + +#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_ + +// When you need to test the private or protected members of a class, +// use the FRIEND_TEST macro to declare your tests as friends of the +// class. For example: +// +// class MyClass { +// private: +// void MyMethod(); +// FRIEND_TEST(MyClassTest, MyMethod); +// }; +// +// class MyClassTest : public testing::Test { +// // ... +// }; +// +// TEST_F(MyClassTest, MyMethod) { +// // Can call MyClass::MyMethod() here. +// } + +#define FRIEND_TEST(test_case_name, test_name)\ +friend class test_case_name##_##test_name##_Test + +#endif // GTEST_INCLUDE_GTEST_GTEST_PROD_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h new file mode 100644 index 0000000000..7e744bd3bb --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h @@ -0,0 +1,69 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Injection point for custom user configurations. +// The following macros can be defined: +// +// Flag related macros: +// GTEST_FLAG(flag_name) +// GTEST_USE_OWN_FLAGFILE_FLAG_ - Define to 0 when the system provides its +// own flagfile flag parsing. 
+// GTEST_DECLARE_bool_(name) +// GTEST_DECLARE_int32_(name) +// GTEST_DECLARE_string_(name) +// GTEST_DEFINE_bool_(name, default_val, doc) +// GTEST_DEFINE_int32_(name, default_val, doc) +// GTEST_DEFINE_string_(name, default_val, doc) +// +// Test filtering: +// GTEST_TEST_FILTER_ENV_VAR_ - The name of an environment variable that +// will be used if --GTEST_FLAG(test_filter) +// is not provided. +// +// Logging: +// GTEST_LOG_(severity) +// GTEST_CHECK_(condition) +// Functions LogToStderr() and FlushInfoLog() have to be provided too. +// +// Threading: +// GTEST_HAS_NOTIFICATION_ - Enabled if Notification is already provided. +// GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ - Enabled if Mutex and ThreadLocal are +// already provided. +// Must also provide GTEST_DECLARE_STATIC_MUTEX_(mutex) and +// GTEST_DEFINE_STATIC_MUTEX_(mutex) +// +// GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) +// GTEST_LOCK_EXCLUDED_(locks) +// +// ** Custom implementation starts here ** + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h new file mode 100644 index 0000000000..60c1ea050b --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h @@ -0,0 +1,42 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// This file provides an injection point for custom printers in a local +// installation of gTest. +// It will be included from gtest-printers.h and the overrides in this file +// will be visible to everyone. +// See documentation at gtest/gtest-printers.h for details on how to define a +// custom printer. 
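For illustration only (not part of the patch): the new custom/gtest-printers.h above is an empty injection point; the usual way to teach Google Test how to print a user-defined type is a PrintTo overload in the type's own namespace, found via argument-dependent lookup, as documented in gtest-printers.h. A minimal sketch; the Point type and namespace are illustrative.

#include <ostream>
#include "gtest/gtest.h"

namespace geometry {

struct Point { int x; int y; };

// Google Test calls this overload whenever it needs to render a Point,
// e.g. in an assertion failure message or via testing::PrintToString.
void PrintTo(const Point& p, ::std::ostream* os) {
  *os << "(" << p.x << ", " << p.y << ")";
}

}  // namespace geometry

TEST(CustomPrinterSketch, UsesPrintTo) {
  geometry::Point p = {1, 2};
  // PrintToString goes through the PrintTo hook defined above.
  EXPECT_EQ("(1, 2)", ::testing::PrintToString(p));
}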
+// +// ** Custom implementation starts here ** + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h new file mode 100644 index 0000000000..c27412a898 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h @@ -0,0 +1,41 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Injection point for custom user configurations. +// The following macros can be defined: +// +// GTEST_OS_STACK_TRACE_GETTER_ - The name of an implementation of +// OsStackTraceGetterInterface. +// +// ** Custom implementation starts here ** + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h new file mode 100644 index 0000000000..2b3a78f5bf --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h @@ -0,0 +1,319 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file defines internal utilities needed for implementing +// death tests. They are subject to change without notice. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ + +#include "gtest/internal/gtest-internal.h" + +#include + +namespace testing { +namespace internal { + +GTEST_DECLARE_string_(internal_run_death_test); + +// Names of the flags (needed for parsing Google Test flags). +const char kDeathTestStyleFlag[] = "death_test_style"; +const char kDeathTestUseFork[] = "death_test_use_fork"; +const char kInternalRunDeathTestFlag[] = "internal_run_death_test"; + +#if GTEST_HAS_DEATH_TEST + +// DeathTest is a class that hides much of the complexity of the +// GTEST_DEATH_TEST_ macro. It is abstract; its static Create method +// returns a concrete class that depends on the prevailing death test +// style, as defined by the --gtest_death_test_style and/or +// --gtest_internal_run_death_test flags. + +// In describing the results of death tests, these terms are used with +// the corresponding definitions: +// +// exit status: The integer exit information in the format specified +// by wait(2) +// exit code: The integer code passed to exit(3), _exit(2), or +// returned from main() +class GTEST_API_ DeathTest { + public: + // Create returns false if there was an error determining the + // appropriate action to take for the current death test; for example, + // if the gtest_death_test_style flag is set to an invalid value. + // The LastMessage method will return a more detailed message in that + // case. Otherwise, the DeathTest pointer pointed to by the "test" + // argument is set. If the death test should be skipped, the pointer + // is set to NULL; otherwise, it is set to the address of a new concrete + // DeathTest object that controls the execution of the current test. + static bool Create(const char* statement, const RE* regex, + const char* file, int line, DeathTest** test); + DeathTest(); + virtual ~DeathTest() { } + + // A helper class that aborts a death test when it's deleted. 
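The death-test style that DeathTest::Create consults can be selected from the command line (--gtest_death_test_style=threadsafe) or programmatically; a minimal sketch, assuming a standard test main():

    #include "gtest/gtest.h"

    int main(int argc, char** argv) {
      ::testing::InitGoogleTest(&argc, argv);
      // "fast" (default) or "threadsafe"; an invalid value makes Create() fail.
      ::testing::FLAGS_gtest_death_test_style = "threadsafe";
      return RUN_ALL_TESTS();
    }
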
+ class ReturnSentinel { + public: + explicit ReturnSentinel(DeathTest* test) : test_(test) { } + ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); } + private: + DeathTest* const test_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel); + } GTEST_ATTRIBUTE_UNUSED_; + + // An enumeration of possible roles that may be taken when a death + // test is encountered. EXECUTE means that the death test logic should + // be executed immediately. OVERSEE means that the program should prepare + // the appropriate environment for a child process to execute the death + // test, then wait for it to complete. + enum TestRole { OVERSEE_TEST, EXECUTE_TEST }; + + // An enumeration of the three reasons that a test might be aborted. + enum AbortReason { + TEST_ENCOUNTERED_RETURN_STATEMENT, + TEST_THREW_EXCEPTION, + TEST_DID_NOT_DIE + }; + + // Assumes one of the above roles. + virtual TestRole AssumeRole() = 0; + + // Waits for the death test to finish and returns its status. + virtual int Wait() = 0; + + // Returns true if the death test passed; that is, the test process + // exited during the test, its exit status matches a user-supplied + // predicate, and its stderr output matches a user-supplied regular + // expression. + // The user-supplied predicate may be a macro expression rather + // than a function pointer or functor, or else Wait and Passed could + // be combined. + virtual bool Passed(bool exit_status_ok) = 0; + + // Signals that the death test did not die as expected. + virtual void Abort(AbortReason reason) = 0; + + // Returns a human-readable outcome message regarding the outcome of + // the last death test. + static const char* LastMessage(); + + static void set_last_death_test_message(const std::string& message); + + private: + // A string containing a description of the outcome of the last death test. + static std::string last_death_test_message_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest); +}; + +// Factory interface for death tests. May be mocked out for testing. +class DeathTestFactory { + public: + virtual ~DeathTestFactory() { } + virtual bool Create(const char* statement, const RE* regex, + const char* file, int line, DeathTest** test) = 0; +}; + +// A concrete DeathTestFactory implementation for normal use. +class DefaultDeathTestFactory : public DeathTestFactory { + public: + virtual bool Create(const char* statement, const RE* regex, + const char* file, int line, DeathTest** test); +}; + +// Returns true if exit_status describes a process that was terminated +// by a signal, or exited normally with a nonzero exit code. +GTEST_API_ bool ExitedUnsuccessfully(int exit_status); + +// Traps C++ exceptions escaping statement and reports them as test +// failures. Note that trapping SEH exceptions is not implemented here. +# if GTEST_HAS_EXCEPTIONS +# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (const ::std::exception& gtest_exception) { \ + fprintf(\ + stderr, \ + "\n%s: Caught std::exception-derived exception escaping the " \ + "death test statement. Exception message: %s\n", \ + ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \ + gtest_exception.what()); \ + fflush(stderr); \ + death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ + } catch (...) 
{ \ + death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ + } + +# else +# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) + +# endif + +// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*, +// ASSERT_EXIT*, and EXPECT_EXIT*. +# define GTEST_DEATH_TEST_(statement, predicate, regex, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + const ::testing::internal::RE& gtest_regex = (regex); \ + ::testing::internal::DeathTest* gtest_dt; \ + if (!::testing::internal::DeathTest::Create(#statement, >est_regex, \ + __FILE__, __LINE__, >est_dt)) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ + } \ + if (gtest_dt != NULL) { \ + ::testing::internal::scoped_ptr< ::testing::internal::DeathTest> \ + gtest_dt_ptr(gtest_dt); \ + switch (gtest_dt->AssumeRole()) { \ + case ::testing::internal::DeathTest::OVERSEE_TEST: \ + if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ + } \ + break; \ + case ::testing::internal::DeathTest::EXECUTE_TEST: { \ + ::testing::internal::DeathTest::ReturnSentinel \ + gtest_sentinel(gtest_dt); \ + GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \ + gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \ + break; \ + } \ + default: \ + break; \ + } \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__): \ + fail(::testing::internal::DeathTest::LastMessage()) +// The symbol "fail" here expands to something into which a message +// can be streamed. + +// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in +// NDEBUG mode. In this case we need the statements to be executed, the regex is +// ignored, and the macro must accept a streamed message even though the message +// is never printed. +# define GTEST_EXECUTE_STATEMENT_(statement, regex) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } else \ + ::testing::Message() + +// A class representing the parsed contents of the +// --gtest_internal_run_death_test flag, as it existed when +// RUN_ALL_TESTS was called. +class InternalRunDeathTestFlag { + public: + InternalRunDeathTestFlag(const std::string& a_file, + int a_line, + int an_index, + int a_write_fd) + : file_(a_file), line_(a_line), index_(an_index), + write_fd_(a_write_fd) {} + + ~InternalRunDeathTestFlag() { + if (write_fd_ >= 0) + posix::Close(write_fd_); + } + + const std::string& file() const { return file_; } + int line() const { return line_; } + int index() const { return index_; } + int write_fd() const { return write_fd_; } + + private: + std::string file_; + int line_; + int index_; + int write_fd_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag); +}; + +// Returns a newly created InternalRunDeathTestFlag object with fields +// initialized from the GTEST_FLAG(internal_run_death_test) flag if +// the flag is specified; otherwise returns NULL. +InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag(); + +#else // GTEST_HAS_DEATH_TEST + +// This macro is used for implementing macros such as +// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where +// death tests are not supported. Those macros must compile on such systems +// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on +// systems that support death tests. 
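For reference, the public macros built on this machinery are used as below (a sketch; MyFatalFunction is hypothetical and not part of this patch):

    #include <cstdio>
    #include <cstdlib>
    #include "gtest/gtest.h"

    void MyFatalFunction() {
      fprintf(stderr, "invalid state\n");
      abort();
    }

    TEST(MyDeathTest, DiesOnInvalidState) {
      // Child process must terminate and its stderr must match the regex.
      EXPECT_DEATH(MyFatalFunction(), "invalid state");
      // Portable variant: compiles everywhere, only runs where supported.
      EXPECT_DEATH_IF_SUPPORTED(MyFatalFunction(), "invalid state");
      // EXPECT_EXIT/ASSERT_EXIT take an explicit exit-status predicate.
      EXPECT_EXIT(exit(1), ::testing::ExitedWithCode(1), "");
    }
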
This allows one to write such a macro +// on a system that does not support death tests and be sure that it will +// compile on a death-test supporting system. +// +// Parameters: +// statement - A statement that a macro such as EXPECT_DEATH would test +// for program termination. This macro has to make sure this +// statement is compiled but not executed, to ensure that +// EXPECT_DEATH_IF_SUPPORTED compiles with a certain +// parameter iff EXPECT_DEATH compiles with it. +// regex - A regex that a macro such as EXPECT_DEATH would use to test +// the output of statement. This parameter has to be +// compiled but not evaluated by this macro, to ensure that +// this macro only accepts expressions that a macro such as +// EXPECT_DEATH would accept. +// terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED +// and a return statement for ASSERT_DEATH_IF_SUPPORTED. +// This ensures that ASSERT_DEATH_IF_SUPPORTED will not +// compile inside functions where ASSERT_DEATH doesn't +// compile. +// +// The branch that has an always false condition is used to ensure that +// statement and regex are compiled (and thus syntactically correct) but +// never executed. The unreachable code macro protects the terminator +// statement from generating an 'unreachable code' warning in case +// statement unconditionally returns or throws. The Message constructor at +// the end allows the syntax of streaming additional messages into the +// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH. +# define GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, terminator) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_LOG_(WARNING) \ + << "Death tests are not supported on this platform.\n" \ + << "Statement '" #statement "' cannot be verified."; \ + } else if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::RE::PartialMatch(".*", (regex)); \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + terminator; \ + } else \ + ::testing::Message() + +#endif // GTEST_HAS_DEATH_TEST + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h new file mode 100644 index 0000000000..7a13b4b0de --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h @@ -0,0 +1,206 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: keith.ray@gmail.com (Keith Ray) +// +// Google Test filepath utilities +// +// This header file declares classes and functions used internally by +// Google Test. They are subject to change without notice. +// +// This file is #included in . +// Do not include this header file separately! + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ + +#include "gtest/internal/gtest-string.h" + +namespace testing { +namespace internal { + +// FilePath - a class for file and directory pathname manipulation which +// handles platform-specific conventions (like the pathname separator). +// Used for helper functions for naming files in a directory for xml output. +// Except for Set methods, all methods are const or static, which provides an +// "immutable value object" -- useful for peace of mind. +// A FilePath with a value ending in a path separator ("like/this/") represents +// a directory, otherwise it is assumed to represent a file. In either case, +// it may or may not represent an actual file or directory in the file system. +// Names are NOT checked for syntax correctness -- no checking for illegal +// characters, malformed paths, etc. + +class GTEST_API_ FilePath { + public: + FilePath() : pathname_("") { } + FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { } + + explicit FilePath(const std::string& pathname) : pathname_(pathname) { + Normalize(); + } + + FilePath& operator=(const FilePath& rhs) { + Set(rhs); + return *this; + } + + void Set(const FilePath& rhs) { + pathname_ = rhs.pathname_; + } + + const std::string& string() const { return pathname_; } + const char* c_str() const { return pathname_.c_str(); } + + // Returns the current working directory, or "" if unsuccessful. + static FilePath GetCurrentDir(); + + // Given directory = "dir", base_name = "test", number = 0, + // extension = "xml", returns "dir/test.xml". If number is greater + // than zero (e.g., 12), returns "dir/test_12.xml". + // On Windows platform, uses \ as the separator rather than /. + static FilePath MakeFileName(const FilePath& directory, + const FilePath& base_name, + int number, + const char* extension); + + // Given directory = "dir", relative_path = "test.xml", + // returns "dir/test.xml". + // On Windows, uses \ as the separator rather than /. + static FilePath ConcatPaths(const FilePath& directory, + const FilePath& relative_path); + + // Returns a pathname for a file that does not currently exist. The pathname + // will be directory/base_name.extension or + // directory/base_name_.extension if directory/base_name.extension + // already exists. The number will be incremented until a pathname is found + // that does not already exist. 
+ // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. + // There could be a race condition if two or more processes are calling this + // function at the same time -- they could both pick the same filename. + static FilePath GenerateUniqueFileName(const FilePath& directory, + const FilePath& base_name, + const char* extension); + + // Returns true iff the path is "". + bool IsEmpty() const { return pathname_.empty(); } + + // If input name has a trailing separator character, removes it and returns + // the name, otherwise return the name string unmodified. + // On Windows platform, uses \ as the separator, other platforms use /. + FilePath RemoveTrailingPathSeparator() const; + + // Returns a copy of the FilePath with the directory part removed. + // Example: FilePath("path/to/file").RemoveDirectoryName() returns + // FilePath("file"). If there is no directory part ("just_a_file"), it returns + // the FilePath unmodified. If there is no file part ("just_a_dir/") it + // returns an empty FilePath (""). + // On Windows platform, '\' is the path separator, otherwise it is '/'. + FilePath RemoveDirectoryName() const; + + // RemoveFileName returns the directory path with the filename removed. + // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". + // If the FilePath is "a_file" or "/a_file", RemoveFileName returns + // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does + // not have a file, like "just/a/dir/", it returns the FilePath unmodified. + // On Windows platform, '\' is the path separator, otherwise it is '/'. + FilePath RemoveFileName() const; + + // Returns a copy of the FilePath with the case-insensitive extension removed. + // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns + // FilePath("dir/file"). If a case-insensitive extension is not + // found, returns a copy of the original FilePath. + FilePath RemoveExtension(const char* extension) const; + + // Creates directories so that path exists. Returns true if successful or if + // the directories already exist; returns false if unable to create + // directories for any reason. Will also return false if the FilePath does + // not represent a directory (that is, it doesn't end with a path separator). + bool CreateDirectoriesRecursively() const; + + // Create the directory so that path exists. Returns true if successful or + // if the directory already exists; returns false if unable to create the + // directory for any reason, including if the parent directory does not + // exist. Not named "CreateDirectory" because that's a macro on Windows. + bool CreateFolder() const; + + // Returns true if FilePath describes something in the file-system, + // either a file, directory, or whatever, and that something exists. + bool FileOrDirectoryExists() const; + + // Returns true if pathname describes a directory in the file-system + // that exists. + bool DirectoryExists() const; + + // Returns true if FilePath ends with a path separator, which indicates that + // it is intended to represent a directory. Returns false otherwise. + // This does NOT check that a directory (or file) actually exists. + bool IsDirectory() const; + + // Returns true if pathname describes a root directory. (Windows has one + // root directory per disk drive.) + bool IsRootDirectory() const; + + // Returns true if pathname describes an absolute path. + bool IsAbsolutePath() const; + + private: + // Replaces multiple consecutive separators with a single separator. 
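A small sketch of how this internal FilePath class is typically exercised (the paths are made up; within the library it mainly names XML report files):

    #include <string>
    #include "gtest/gtest.h"

    using ::testing::internal::FilePath;

    void FilePathDemo() {
      const FilePath dir(std::string("reports/"));
      // "reports/results_3.xml" on POSIX; '\' is the separator on Windows.
      const FilePath name =
          FilePath::MakeFileName(dir, FilePath(std::string("results")), 3, "xml");
      const FilePath parent = name.RemoveFileName();      // "reports/"
      const FilePath base = name.RemoveDirectoryName();   // "results_3.xml"
      (void)parent; (void)base;
    }
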
+ // For example, "bar///foo" becomes "bar/foo". Does not eliminate other + // redundancies that might be in a pathname involving "." or "..". + // + // A pathname with multiple consecutive separators may occur either through + // user error or as a result of some scripts or APIs that generate a pathname + // with a trailing separator. On other platforms the same API or script + // may NOT generate a pathname with a trailing "/". Then elsewhere that + // pathname may have another "/" and pathname components added to it, + // without checking for the separator already being there. + // The script language and operating system may allow paths like "foo//bar" + // but some of the functions in FilePath will not handle that correctly. In + // particular, RemoveTrailingPathSeparator() only removes one separator, and + // it is called in CreateDirectoriesRecursively() assuming that it will change + // a pathname from directory syntax (trailing separator) to filename syntax. + // + // On Windows this method also replaces the alternate path separator '/' with + // the primary path separator '\\', so that for example "bar\\/\\foo" becomes + // "bar\\foo". + + void Normalize(); + + // Returns a pointer to the last occurence of a valid path separator in + // the FilePath. On Windows, for example, both '/' and '\' are valid path + // separators. Returns NULL if no path separator was found. + const char* FindLastPathSeparator() const; + + std::string pathname_; +}; // class FilePath + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h new file mode 100644 index 0000000000..ebd1cf615d --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h @@ -0,0 +1,1238 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file declares functions and macros used internally by +// Google Test. They are subject to change without notice. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ + +#include "gtest/internal/gtest-port.h" + +#if GTEST_OS_LINUX +# include +# include +# include +# include +#endif // GTEST_OS_LINUX + +#if GTEST_HAS_EXCEPTIONS +# include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest-message.h" +#include "gtest/internal/gtest-string.h" +#include "gtest/internal/gtest-filepath.h" +#include "gtest/internal/gtest-type-util.h" + +// Due to C++ preprocessor weirdness, we need double indirection to +// concatenate two tokens when one of them is __LINE__. Writing +// +// foo ## __LINE__ +// +// will result in the token foo__LINE__, instead of foo followed by +// the current line number. For more details, see +// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6 +#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar) +#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar + +class ProtocolMessage; +namespace proto2 { class Message; } + +namespace testing { + +// Forward declarations. + +class AssertionResult; // Result of an assertion. +class Message; // Represents a failure message. +class Test; // Represents a test. +class TestInfo; // Information about a test. +class TestPartResult; // Result of a test part. +class UnitTest; // A collection of test cases. + +template +::std::string PrintToString(const T& value); + +namespace internal { + +struct TraceInfo; // Information about a trace point. +class ScopedTrace; // Implements scoped trace. +class TestInfoImpl; // Opaque implementation of TestInfo +class UnitTestImpl; // Opaque implementation of UnitTest + +// The text used in failure messages to indicate the start of the +// stack trace. +GTEST_API_ extern const char kStackTraceMarker[]; + +// Two overloaded helpers for checking at compile time whether an +// expression is a null pointer literal (i.e. NULL or any 0-valued +// compile-time integral constant). Their return values have +// different sizes, so we can use sizeof() to test which version is +// picked by the compiler. These helpers have no implementations, as +// we only need their signatures. +// +// Given IsNullLiteralHelper(x), the compiler will pick the first +// version if x can be implicitly converted to Secret*, and pick the +// second version otherwise. Since Secret is a secret and incomplete +// type, the only expression a user can write that has type Secret* is +// a null pointer literal. Therefore, we know that x is a null +// pointer literal if and only if the first version is picked by the +// compiler. +char IsNullLiteralHelper(Secret* p); +char (&IsNullLiteralHelper(...))[2]; // NOLINT + +// A compile-time bool constant that is true if and only if x is a +// null pointer literal (i.e. NULL or any 0-valued compile-time +// integral constant). +#ifdef GTEST_ELLIPSIS_NEEDS_POD_ +// We lose support for NULL detection where the compiler doesn't like +// passing non-POD classes through ellipsis (...). 
+# define GTEST_IS_NULL_LITERAL_(x) false +#else +# define GTEST_IS_NULL_LITERAL_(x) \ + (sizeof(::testing::internal::IsNullLiteralHelper(x)) == 1) +#endif // GTEST_ELLIPSIS_NEEDS_POD_ + +// Appends the user-supplied message to the Google-Test-generated message. +GTEST_API_ std::string AppendUserMessage( + const std::string& gtest_msg, const Message& user_msg); + +#if GTEST_HAS_EXCEPTIONS + +// This exception is thrown by (and only by) a failed Google Test +// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions +// are enabled). We derive it from std::runtime_error, which is for +// errors presumably detectable only at run time. Since +// std::runtime_error inherits from std::exception, many testing +// frameworks know how to extract and print the message inside it. +class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error { + public: + explicit GoogleTestFailureException(const TestPartResult& failure); +}; + +#endif // GTEST_HAS_EXCEPTIONS + +// A helper class for creating scoped traces in user programs. +class GTEST_API_ ScopedTrace { + public: + // The c'tor pushes the given source file location and message onto + // a trace stack maintained by Google Test. + ScopedTrace(const char* file, int line, const Message& message); + + // The d'tor pops the info pushed by the c'tor. + // + // Note that the d'tor is not virtual in order to be efficient. + // Don't inherit from ScopedTrace! + ~ScopedTrace(); + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace); +} GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its + // c'tor and d'tor. Therefore it doesn't + // need to be used otherwise. + +namespace edit_distance { +// Returns the optimal edits to go from 'left' to 'right'. +// All edits cost the same, with replace having lower priority than +// add/remove. +// Simple implementation of the Wagner–Fischer algorithm. +// See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm +enum EditType { kMatch, kAdd, kRemove, kReplace }; +GTEST_API_ std::vector CalculateOptimalEdits( + const std::vector& left, const std::vector& right); + +// Same as above, but the input is represented as strings. +GTEST_API_ std::vector CalculateOptimalEdits( + const std::vector& left, + const std::vector& right); + +// Create a diff of the input strings in Unified diff format. +GTEST_API_ std::string CreateUnifiedDiff(const std::vector& left, + const std::vector& right, + size_t context = 2); + +} // namespace edit_distance + +// Calculate the diff between 'left' and 'right' and return it in unified diff +// format. +// If not null, stores in 'total_line_count' the total number of lines found +// in left + right. +GTEST_API_ std::string DiffStrings(const std::string& left, + const std::string& right, + size_t* total_line_count); + +// Constructs and returns the message for an equality assertion +// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. +// +// The first four parameters are the expressions used in the assertion +// and their values, as strings. For example, for ASSERT_EQ(foo, bar) +// where foo is 5 and bar is 6, we have: +// +// expected_expression: "foo" +// actual_expression: "bar" +// expected_value: "5" +// actual_value: "6" +// +// The ignoring_case parameter is true iff the assertion is a +// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will +// be inserted into the message. 
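The ScopedTrace class above backs the public SCOPED_TRACE macro; a short usage sketch (Sub is an illustrative helper):

    #include "gtest/gtest.h"

    void Sub(int n) { EXPECT_GE(n, 0); }

    TEST(TraceTest, AnnotatesFailures) {
      for (int i = -1; i < 2; ++i) {
        SCOPED_TRACE(i);  // any failure inside this scope reports the trace point
        Sub(i);
      }
    }
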
+GTEST_API_ AssertionResult EqFailure(const char* expected_expression, + const char* actual_expression, + const std::string& expected_value, + const std::string& actual_value, + bool ignoring_case); + +// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. +GTEST_API_ std::string GetBoolAssertionFailureMessage( + const AssertionResult& assertion_result, + const char* expression_text, + const char* actual_predicate_value, + const char* expected_predicate_value); + +// This template class represents an IEEE floating-point number +// (either single-precision or double-precision, depending on the +// template parameters). +// +// The purpose of this class is to do more sophisticated number +// comparison. (Due to round-off error, etc, it's very unlikely that +// two floating-points will be equal exactly. Hence a naive +// comparison by the == operation often doesn't work.) +// +// Format of IEEE floating-point: +// +// The most-significant bit being the leftmost, an IEEE +// floating-point looks like +// +// sign_bit exponent_bits fraction_bits +// +// Here, sign_bit is a single bit that designates the sign of the +// number. +// +// For float, there are 8 exponent bits and 23 fraction bits. +// +// For double, there are 11 exponent bits and 52 fraction bits. +// +// More details can be found at +// http://en.wikipedia.org/wiki/IEEE_floating-point_standard. +// +// Template parameter: +// +// RawType: the raw floating-point type (either float or double) +template +class FloatingPoint { + public: + // Defines the unsigned integer type that has the same size as the + // floating point number. + typedef typename TypeWithSize::UInt Bits; + + // Constants. + + // # of bits in a number. + static const size_t kBitCount = 8*sizeof(RawType); + + // # of fraction bits in a number. + static const size_t kFractionBitCount = + std::numeric_limits::digits - 1; + + // # of exponent bits in a number. + static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount; + + // The mask for the sign bit. + static const Bits kSignBitMask = static_cast(1) << (kBitCount - 1); + + // The mask for the fraction bits. + static const Bits kFractionBitMask = + ~static_cast(0) >> (kExponentBitCount + 1); + + // The mask for the exponent bits. + static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask); + + // How many ULP's (Units in the Last Place) we want to tolerate when + // comparing two numbers. The larger the value, the more error we + // allow. A 0 value means that two numbers must be exactly the same + // to be considered equal. + // + // The maximum error of a single floating-point operation is 0.5 + // units in the last place. On Intel CPU's, all floating-point + // calculations are done with 80-bit precision, while double has 64 + // bits. Therefore, 4 should be enough for ordinary use. + // + // See the following article for more details on ULP: + // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ + static const size_t kMaxUlps = 4; + + // Constructs a FloatingPoint from a raw floating-point number. + // + // On an Intel CPU, passing a non-normalized NAN (Not a Number) + // around may change its bits, although the new value is guaranteed + // to be also a NAN. Therefore, don't expect this constructor to + // preserve the bits in x when x is a NAN. + explicit FloatingPoint(const RawType& x) { u_.value_ = x; } + + // Static methods + + // Reinterprets a bit pattern as a floating-point number. 
+ // + // This function is needed to test the AlmostEquals() method. + static RawType ReinterpretBits(const Bits bits) { + FloatingPoint fp(0); + fp.u_.bits_ = bits; + return fp.u_.value_; + } + + // Returns the floating-point number that represent positive infinity. + static RawType Infinity() { + return ReinterpretBits(kExponentBitMask); + } + + // Returns the maximum representable finite floating-point number. + static RawType Max(); + + // Non-static methods + + // Returns the bits that represents this number. + const Bits &bits() const { return u_.bits_; } + + // Returns the exponent bits of this number. + Bits exponent_bits() const { return kExponentBitMask & u_.bits_; } + + // Returns the fraction bits of this number. + Bits fraction_bits() const { return kFractionBitMask & u_.bits_; } + + // Returns the sign bit of this number. + Bits sign_bit() const { return kSignBitMask & u_.bits_; } + + // Returns true iff this is NAN (not a number). + bool is_nan() const { + // It's a NAN if the exponent bits are all ones and the fraction + // bits are not entirely zeros. + return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0); + } + + // Returns true iff this number is at most kMaxUlps ULP's away from + // rhs. In particular, this function: + // + // - returns false if either number is (or both are) NAN. + // - treats really large numbers as almost equal to infinity. + // - thinks +0.0 and -0.0 are 0 DLP's apart. + bool AlmostEquals(const FloatingPoint& rhs) const { + // The IEEE standard says that any comparison operation involving + // a NAN must return false. + if (is_nan() || rhs.is_nan()) return false; + + return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) + <= kMaxUlps; + } + + private: + // The data type used to store the actual floating-point number. + union FloatingPointUnion { + RawType value_; // The raw floating-point number. + Bits bits_; // The bits that represent the number. + }; + + // Converts an integer from the sign-and-magnitude representation to + // the biased representation. More precisely, let N be 2 to the + // power of (kBitCount - 1), an integer x is represented by the + // unsigned number x + N. + // + // For instance, + // + // -N + 1 (the most negative number representable using + // sign-and-magnitude) is represented by 1; + // 0 is represented by N; and + // N - 1 (the biggest number representable using + // sign-and-magnitude) is represented by 2N - 1. + // + // Read http://en.wikipedia.org/wiki/Signed_number_representations + // for more details on signed number representations. + static Bits SignAndMagnitudeToBiased(const Bits &sam) { + if (kSignBitMask & sam) { + // sam represents a negative number. + return ~sam + 1; + } else { + // sam represents a positive number. + return kSignBitMask | sam; + } + } + + // Given two numbers in the sign-and-magnitude representation, + // returns the distance between them as an unsigned number. + static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1, + const Bits &sam2) { + const Bits biased1 = SignAndMagnitudeToBiased(sam1); + const Bits biased2 = SignAndMagnitudeToBiased(sam2); + return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1); + } + + FloatingPointUnion u_; +}; + +// We cannot use std::numeric_limits::max() as it clashes with the max() +// macro defined by . 
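A sketch of the ULP-based comparison this class implements; in tests one would normally reach it through EXPECT_FLOAT_EQ / EXPECT_DOUBLE_EQ rather than directly:

    #include "gtest/gtest.h"

    bool NearlyEqual(float a, float b) {
      const ::testing::internal::FloatingPoint<float> lhs(a), rhs(b);
      // True when a and b differ by at most kMaxUlps (4) units in the last place.
      return lhs.AlmostEquals(rhs);
    }
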
+template <> +inline float FloatingPoint::Max() { return FLT_MAX; } +template <> +inline double FloatingPoint::Max() { return DBL_MAX; } + +// Typedefs the instances of the FloatingPoint template class that we +// care to use. +typedef FloatingPoint Float; +typedef FloatingPoint Double; + +// In order to catch the mistake of putting tests that use different +// test fixture classes in the same test case, we need to assign +// unique IDs to fixture classes and compare them. The TypeId type is +// used to hold such IDs. The user should treat TypeId as an opaque +// type: the only operation allowed on TypeId values is to compare +// them for equality using the == operator. +typedef const void* TypeId; + +template +class TypeIdHelper { + public: + // dummy_ must not have a const type. Otherwise an overly eager + // compiler (e.g. MSVC 7.1 & 8.0) may try to merge + // TypeIdHelper::dummy_ for different Ts as an "optimization". + static bool dummy_; +}; + +template +bool TypeIdHelper::dummy_ = false; + +// GetTypeId() returns the ID of type T. Different values will be +// returned for different types. Calling the function twice with the +// same type argument is guaranteed to return the same ID. +template +TypeId GetTypeId() { + // The compiler is required to allocate a different + // TypeIdHelper::dummy_ variable for each T used to instantiate + // the template. Therefore, the address of dummy_ is guaranteed to + // be unique. + return &(TypeIdHelper::dummy_); +} + +// Returns the type ID of ::testing::Test. Always call this instead +// of GetTypeId< ::testing::Test>() to get the type ID of +// ::testing::Test, as the latter may give the wrong result due to a +// suspected linker bug when compiling Google Test as a Mac OS X +// framework. +GTEST_API_ TypeId GetTestTypeId(); + +// Defines the abstract factory interface that creates instances +// of a Test object. +class TestFactoryBase { + public: + virtual ~TestFactoryBase() {} + + // Creates a test instance to run. The instance is both created and destroyed + // within TestInfoImpl::Run() + virtual Test* CreateTest() = 0; + + protected: + TestFactoryBase() {} + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase); +}; + +// This class provides implementation of TeastFactoryBase interface. +// It is used in TEST and TEST_F macros. +template +class TestFactoryImpl : public TestFactoryBase { + public: + virtual Test* CreateTest() { return new TestClass; } +}; + +#if GTEST_OS_WINDOWS + +// Predicate-formatters for implementing the HRESULT checking macros +// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED} +// We pass a long instead of HRESULT to avoid causing an +// include dependency for the HRESULT type. +GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr, + long hr); // NOLINT +GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr, + long hr); // NOLINT + +#endif // GTEST_OS_WINDOWS + +// Types of SetUpTestCase() and TearDownTestCase() functions. +typedef void (*SetUpTestCaseFunc)(); +typedef void (*TearDownTestCaseFunc)(); + +struct CodeLocation { + CodeLocation(const string& a_file, int a_line) : file(a_file), line(a_line) {} + + string file; + int line; +}; + +// Creates a new TestInfo object and registers it with Google Test; +// returns the created object. +// +// Arguments: +// +// test_case_name: name of the test case +// name: name of the test +// type_param the name of the test's type parameter, or NULL if +// this is not a typed or a type-parameterized test. 
+// value_param text representation of the test's value parameter, +// or NULL if this is not a type-parameterized test. +// code_location: code location where the test is defined +// fixture_class_id: ID of the test fixture class +// set_up_tc: pointer to the function that sets up the test case +// tear_down_tc: pointer to the function that tears down the test case +// factory: pointer to the factory that creates a test object. +// The newly created TestInfo instance will assume +// ownership of the factory object. +GTEST_API_ TestInfo* MakeAndRegisterTestInfo( + const char* test_case_name, + const char* name, + const char* type_param, + const char* value_param, + CodeLocation code_location, + TypeId fixture_class_id, + SetUpTestCaseFunc set_up_tc, + TearDownTestCaseFunc tear_down_tc, + TestFactoryBase* factory); + +// If *pstr starts with the given prefix, modifies *pstr to be right +// past the prefix and returns true; otherwise leaves *pstr unchanged +// and returns false. None of pstr, *pstr, and prefix can be NULL. +GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr); + +#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +// State of the definition of a type-parameterized test case. +class GTEST_API_ TypedTestCasePState { + public: + TypedTestCasePState() : registered_(false) {} + + // Adds the given test name to defined_test_names_ and return true + // if the test case hasn't been registered; otherwise aborts the + // program. + bool AddTestName(const char* file, int line, const char* case_name, + const char* test_name) { + if (registered_) { + fprintf(stderr, "%s Test %s must be defined before " + "REGISTER_TYPED_TEST_CASE_P(%s, ...).\n", + FormatFileLocation(file, line).c_str(), test_name, case_name); + fflush(stderr); + posix::Abort(); + } + registered_tests_.insert( + ::std::make_pair(test_name, CodeLocation(file, line))); + return true; + } + + bool TestExists(const std::string& test_name) const { + return registered_tests_.count(test_name) > 0; + } + + const CodeLocation& GetCodeLocation(const std::string& test_name) const { + RegisteredTestsMap::const_iterator it = registered_tests_.find(test_name); + GTEST_CHECK_(it != registered_tests_.end()); + return it->second; + } + + // Verifies that registered_tests match the test names in + // defined_test_names_; returns registered_tests if successful, or + // aborts the program otherwise. + const char* VerifyRegisteredTestNames( + const char* file, int line, const char* registered_tests); + + private: + typedef ::std::map RegisteredTestsMap; + + bool registered_; + RegisteredTestsMap registered_tests_; +}; + +// Skips to the first non-space char after the first comma in 'str'; +// returns NULL if no comma is found in 'str'. +inline const char* SkipComma(const char* str) { + const char* comma = strchr(str, ','); + if (comma == NULL) { + return NULL; + } + while (IsSpace(*(++comma))) {} + return comma; +} + +// Returns the prefix of 'str' before the first comma in it; returns +// the entire string if it contains no comma. +inline std::string GetPrefixUntilComma(const char* str) { + const char* comma = strchr(str, ','); + return comma == NULL ? str : std::string(str, comma); +} + +// Splits a given string on a given delimiter, populating a given +// vector with the fields. +void SplitString(const ::std::string& str, char delimiter, + ::std::vector< ::std::string>* dest); + +// TypeParameterizedTest::Register() +// registers a list of type-parameterized tests with Google Test. 
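The registration state above ultimately serves the public type-parameterized test macros; a minimal sketch (the fixture and type list are illustrative):

    #include "gtest/gtest.h"

    template <typename T>
    class StackTest : public ::testing::Test {};

    TYPED_TEST_CASE_P(StackTest);

    TYPED_TEST_P(StackTest, DefaultConstructedValuesCompareEqual) {
      EXPECT_EQ(TypeParam(), TypeParam());
    }

    REGISTER_TYPED_TEST_CASE_P(StackTest, DefaultConstructedValuesCompareEqual);

    typedef ::testing::Types<int, long> MyTypes;
    INSTANTIATE_TYPED_TEST_CASE_P(My, StackTest, MyTypes);
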
The +// return value is insignificant - we just need to return something +// such that we can call this function in a namespace scope. +// +// Implementation note: The GTEST_TEMPLATE_ macro declares a template +// template parameter. It's defined in gtest-type-util.h. +template +class TypeParameterizedTest { + public: + // 'index' is the index of the test in the type list 'Types' + // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase, + // Types). Valid values for 'index' are [0, N - 1] where N is the + // length of Types. + static bool Register(const char* prefix, + CodeLocation code_location, + const char* case_name, const char* test_names, + int index) { + typedef typename Types::Head Type; + typedef Fixture FixtureClass; + typedef typename GTEST_BIND_(TestSel, Type) TestClass; + + // First, registers the first type-parameterized test in the type + // list. + MakeAndRegisterTestInfo( + (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + "/" + + StreamableToString(index)).c_str(), + StripTrailingSpaces(GetPrefixUntilComma(test_names)).c_str(), + GetTypeName().c_str(), + NULL, // No value parameter. + code_location, + GetTypeId(), + TestClass::SetUpTestCase, + TestClass::TearDownTestCase, + new TestFactoryImpl); + + // Next, recurses (at compile time) with the tail of the type list. + return TypeParameterizedTest + ::Register(prefix, code_location, case_name, test_names, index + 1); + } +}; + +// The base case for the compile time recursion. +template +class TypeParameterizedTest { + public: + static bool Register(const char* /*prefix*/, CodeLocation, + const char* /*case_name*/, const char* /*test_names*/, + int /*index*/) { + return true; + } +}; + +// TypeParameterizedTestCase::Register() +// registers *all combinations* of 'Tests' and 'Types' with Google +// Test. The return value is insignificant - we just need to return +// something such that we can call this function in a namespace scope. +template +class TypeParameterizedTestCase { + public: + static bool Register(const char* prefix, CodeLocation code_location, + const TypedTestCasePState* state, + const char* case_name, const char* test_names) { + std::string test_name = StripTrailingSpaces( + GetPrefixUntilComma(test_names)); + if (!state->TestExists(test_name)) { + fprintf(stderr, "Failed to get code location for test %s.%s at %s.", + case_name, test_name.c_str(), + FormatFileLocation(code_location.file.c_str(), + code_location.line).c_str()); + fflush(stderr); + posix::Abort(); + } + const CodeLocation& test_location = state->GetCodeLocation(test_name); + + typedef typename Tests::Head Head; + + // First, register the first test in 'Test' for each type in 'Types'. + TypeParameterizedTest::Register( + prefix, test_location, case_name, test_names, 0); + + // Next, recurses (at compile time) with the tail of the test list. + return TypeParameterizedTestCase + ::Register(prefix, code_location, state, + case_name, SkipComma(test_names)); + } +}; + +// The base case for the compile time recursion. +template +class TypeParameterizedTestCase { + public: + static bool Register(const char* /*prefix*/, CodeLocation, + const TypedTestCasePState* /*state*/, + const char* /*case_name*/, const char* /*test_names*/) { + return true; + } +}; + +#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +// Returns the current OS stack trace as an std::string. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. 
The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in +// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. +GTEST_API_ std::string GetCurrentOsStackTraceExceptTop( + UnitTest* unit_test, int skip_count); + +// Helpers for suppressing warnings on unreachable code or constant +// condition. + +// Always returns true. +GTEST_API_ bool AlwaysTrue(); + +// Always returns false. +inline bool AlwaysFalse() { return !AlwaysTrue(); } + +// Helper for suppressing false warning from Clang on a const char* +// variable declared in a conditional expression always being NULL in +// the else branch. +struct GTEST_API_ ConstCharPtr { + ConstCharPtr(const char* str) : value(str) {} + operator bool() const { return true; } + const char* value; +}; + +// A simple Linear Congruential Generator for generating random +// numbers with a uniform distribution. Unlike rand() and srand(), it +// doesn't use global state (and therefore can't interfere with user +// code). Unlike rand_r(), it's portable. An LCG isn't very random, +// but it's good enough for our purposes. +class GTEST_API_ Random { + public: + static const UInt32 kMaxRange = 1u << 31; + + explicit Random(UInt32 seed) : state_(seed) {} + + void Reseed(UInt32 seed) { state_ = seed; } + + // Generates a random number from [0, range). Crashes if 'range' is + // 0 or greater than kMaxRange. + UInt32 Generate(UInt32 range); + + private: + UInt32 state_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(Random); +}; + +// Defining a variable of type CompileAssertTypesEqual will cause a +// compiler error iff T1 and T2 are different types. +template +struct CompileAssertTypesEqual; + +template +struct CompileAssertTypesEqual { +}; + +// Removes the reference from a type if it is a reference type, +// otherwise leaves it unchanged. This is the same as +// tr1::remove_reference, which is not widely available yet. +template +struct RemoveReference { typedef T type; }; // NOLINT +template +struct RemoveReference { typedef T type; }; // NOLINT + +// A handy wrapper around RemoveReference that works when the argument +// T depends on template parameters. +#define GTEST_REMOVE_REFERENCE_(T) \ + typename ::testing::internal::RemoveReference::type + +// Removes const from a type if it is a const type, otherwise leaves +// it unchanged. This is the same as tr1::remove_const, which is not +// widely available yet. +template +struct RemoveConst { typedef T type; }; // NOLINT +template +struct RemoveConst { typedef T type; }; // NOLINT + +// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above +// definition to fail to remove the const in 'const int[3]' and 'const +// char[3][4]'. The following specialization works around the bug. +template +struct RemoveConst { + typedef typename RemoveConst::type type[N]; +}; + +#if defined(_MSC_VER) && _MSC_VER < 1400 +// This is the only specialization that allows VC++ 7.1 to remove const in +// 'const int[3] and 'const int[3][4]'. However, it causes trouble with GCC +// and thus needs to be conditionally compiled. +template +struct RemoveConst { + typedef typename RemoveConst::type type[N]; +}; +#endif + +// A handy wrapper around RemoveConst that works when the argument +// T depends on template parameters. 
+#define GTEST_REMOVE_CONST_(T) \ + typename ::testing::internal::RemoveConst::type + +// Turns const U&, U&, const U, and U all into U. +#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \ + GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T)) + +// Adds reference to a type if it is not a reference type, +// otherwise leaves it unchanged. This is the same as +// tr1::add_reference, which is not widely available yet. +template +struct AddReference { typedef T& type; }; // NOLINT +template +struct AddReference { typedef T& type; }; // NOLINT + +// A handy wrapper around AddReference that works when the argument T +// depends on template parameters. +#define GTEST_ADD_REFERENCE_(T) \ + typename ::testing::internal::AddReference::type + +// Adds a reference to const on top of T as necessary. For example, +// it transforms +// +// char ==> const char& +// const char ==> const char& +// char& ==> const char& +// const char& ==> const char& +// +// The argument T must depend on some template parameters. +#define GTEST_REFERENCE_TO_CONST_(T) \ + GTEST_ADD_REFERENCE_(const GTEST_REMOVE_REFERENCE_(T)) + +// ImplicitlyConvertible::value is a compile-time bool +// constant that's true iff type From can be implicitly converted to +// type To. +template +class ImplicitlyConvertible { + private: + // We need the following helper functions only for their types. + // They have no implementations. + + // MakeFrom() is an expression whose type is From. We cannot simply + // use From(), as the type From may not have a public default + // constructor. + static typename AddReference::type MakeFrom(); + + // These two functions are overloaded. Given an expression + // Helper(x), the compiler will pick the first version if x can be + // implicitly converted to type To; otherwise it will pick the + // second version. + // + // The first version returns a value of size 1, and the second + // version returns a value of size 2. Therefore, by checking the + // size of Helper(x), which can be done at compile time, we can tell + // which version of Helper() is used, and hence whether x can be + // implicitly converted to type To. + static char Helper(To); + static char (&Helper(...))[2]; // NOLINT + + // We have to put the 'public' section after the 'private' section, + // or MSVC refuses to compile the code. + public: +#if defined(__BORLANDC__) + // C++Builder cannot use member overload resolution during template + // instantiation. The simplest workaround is to use its C++0x type traits + // functions (C++Builder 2009 and above only). + static const bool value = __is_convertible(From, To); +#else + // MSVC warns about implicitly converting from double to int for + // possible loss of data, so we need to temporarily disable the + // warning. + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4244) + static const bool value = + sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1; + GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif // __BORLANDC__ +}; +template +const bool ImplicitlyConvertible::value; + +// IsAProtocolMessage::value is a compile-time bool constant that's +// true iff T is type ProtocolMessage, proto2::Message, or a subclass +// of those. 
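A sketch of the compile-time conversion probe above in action (values evaluated entirely at compile time):

    #include "gtest/gtest.h"

    // int converts to double, so kIntToDouble is true; int* does not, so
    // kPtrToDouble is false.
    const bool kIntToDouble =
        ::testing::internal::ImplicitlyConvertible<int, double>::value;
    const bool kPtrToDouble =
        ::testing::internal::ImplicitlyConvertible<int*, double>::value;
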
+template +struct IsAProtocolMessage + : public bool_constant< + ImplicitlyConvertible::value || + ImplicitlyConvertible::value> { +}; + +// When the compiler sees expression IsContainerTest(0), if C is an +// STL-style container class, the first overload of IsContainerTest +// will be viable (since both C::iterator* and C::const_iterator* are +// valid types and NULL can be implicitly converted to them). It will +// be picked over the second overload as 'int' is a perfect match for +// the type of argument 0. If C::iterator or C::const_iterator is not +// a valid type, the first overload is not viable, and the second +// overload will be picked. Therefore, we can determine whether C is +// a container class by checking the type of IsContainerTest(0). +// The value of the expression is insignificant. +// +// Note that we look for both C::iterator and C::const_iterator. The +// reason is that C++ injects the name of a class as a member of the +// class itself (e.g. you can refer to class iterator as either +// 'iterator' or 'iterator::iterator'). If we look for C::iterator +// only, for example, we would mistakenly think that a class named +// iterator is an STL container. +// +// Also note that the simpler approach of overloading +// IsContainerTest(typename C::const_iterator*) and +// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++. +typedef int IsContainer; +template +IsContainer IsContainerTest(int /* dummy */, + typename C::iterator* /* it */ = NULL, + typename C::const_iterator* /* const_it */ = NULL) { + return 0; +} + +typedef char IsNotContainer; +template +IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; } + +// EnableIf::type is void when 'Cond' is true, and +// undefined when 'Cond' is false. To use SFINAE to make a function +// overload only apply when a particular expression is true, add +// "typename EnableIf::type* = 0" as the last parameter. +template struct EnableIf; +template<> struct EnableIf { typedef void type; }; // NOLINT + +// Utilities for native arrays. + +// ArrayEq() compares two k-dimensional native arrays using the +// elements' operator==, where k can be any integer >= 0. When k is +// 0, ArrayEq() degenerates into comparing a single pair of values. + +template +bool ArrayEq(const T* lhs, size_t size, const U* rhs); + +// This generic version is used when k is 0. +template +inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; } + +// This overload is used when k >= 1. +template +inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) { + return internal::ArrayEq(lhs, N, rhs); +} + +// This helper reduces code bloat. If we instead put its logic inside +// the previous ArrayEq() function, arrays with different sizes would +// lead to different copies of the template code. +template +bool ArrayEq(const T* lhs, size_t size, const U* rhs) { + for (size_t i = 0; i != size; i++) { + if (!internal::ArrayEq(lhs[i], rhs[i])) + return false; + } + return true; +} + +// Finds the first element in the iterator range [begin, end) that +// equals elem. Element may be a native array type itself. +template +Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) { + for (Iter it = begin; it != end; ++it) { + if (internal::ArrayEq(*it, elem)) + return it; + } + return end; +} + +// CopyArray() copies a k-dimensional native array using the elements' +// operator=, where k can be any integer >= 0. When k is 0, +// CopyArray() degenerates into copying a single value. 
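// [Editor's note: illustrative sketch, not part of the upstream patch.]
// ArrayEq() and ArrayAwareFind() work on nested native arrays, e.g.:
//
//   const int a[2][3] = {{1, 2, 3}, {4, 5, 6}};
//   const int b[2][3] = {{1, 2, 3}, {4, 5, 6}};
//   bool same = ::testing::internal::ArrayEq(a, b);  // true: element-wise ==
//
//   const int needle[3] = {4, 5, 6};
//   // Yields &a[1]; each row is compared with ArrayEq(), not operator==.
//   const int (*row)[3] = ::testing::internal::ArrayAwareFind(a, a + 2, needle);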
+ +template +void CopyArray(const T* from, size_t size, U* to); + +// This generic version is used when k is 0. +template +inline void CopyArray(const T& from, U* to) { *to = from; } + +// This overload is used when k >= 1. +template +inline void CopyArray(const T(&from)[N], U(*to)[N]) { + internal::CopyArray(from, N, *to); +} + +// This helper reduces code bloat. If we instead put its logic inside +// the previous CopyArray() function, arrays with different sizes +// would lead to different copies of the template code. +template +void CopyArray(const T* from, size_t size, U* to) { + for (size_t i = 0; i != size; i++) { + internal::CopyArray(from[i], to + i); + } +} + +// The relation between an NativeArray object (see below) and the +// native array it represents. +// We use 2 different structs to allow non-copyable types to be used, as long +// as RelationToSourceReference() is passed. +struct RelationToSourceReference {}; +struct RelationToSourceCopy {}; + +// Adapts a native array to a read-only STL-style container. Instead +// of the complete STL container concept, this adaptor only implements +// members useful for Google Mock's container matchers. New members +// should be added as needed. To simplify the implementation, we only +// support Element being a raw type (i.e. having no top-level const or +// reference modifier). It's the client's responsibility to satisfy +// this requirement. Element can be an array type itself (hence +// multi-dimensional arrays are supported). +template +class NativeArray { + public: + // STL-style container typedefs. + typedef Element value_type; + typedef Element* iterator; + typedef const Element* const_iterator; + + // Constructs from a native array. References the source. + NativeArray(const Element* array, size_t count, RelationToSourceReference) { + InitRef(array, count); + } + + // Constructs from a native array. Copies the source. + NativeArray(const Element* array, size_t count, RelationToSourceCopy) { + InitCopy(array, count); + } + + // Copy constructor. + NativeArray(const NativeArray& rhs) { + (this->*rhs.clone_)(rhs.array_, rhs.size_); + } + + ~NativeArray() { + if (clone_ != &NativeArray::InitRef) + delete[] array_; + } + + // STL-style container methods. + size_t size() const { return size_; } + const_iterator begin() const { return array_; } + const_iterator end() const { return array_ + size_; } + bool operator==(const NativeArray& rhs) const { + return size() == rhs.size() && + ArrayEq(begin(), size(), rhs.begin()); + } + + private: + enum { + kCheckTypeIsNotConstOrAReference = StaticAssertTypeEqHelper< + Element, GTEST_REMOVE_REFERENCE_AND_CONST_(Element)>::value, + }; + + // Initializes this object with a copy of the input. + void InitCopy(const Element* array, size_t a_size) { + Element* const copy = new Element[a_size]; + CopyArray(array, a_size, copy); + array_ = copy; + size_ = a_size; + clone_ = &NativeArray::InitCopy; + } + + // Initializes this object with a reference of the input. 
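// [Editor's note: illustrative sketch, not part of the upstream patch.]
// A NativeArray can wrap a C array without copying it:
//
//   const int values[] = {1, 2, 3};
//   ::testing::internal::NativeArray<int> view(
//       values, 3, ::testing::internal::RelationToSourceReference());
//   // view.size() == 3 and *view.begin() == 1; no copy of 'values' is made.
//   // Passing RelationToSourceCopy() instead would allocate and copy.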
+ void InitRef(const Element* array, size_t a_size) { + array_ = array; + size_ = a_size; + clone_ = &NativeArray::InitRef; + } + + const Element* array_; + size_t size_; + void (NativeArray::*clone_)(const Element*, size_t); + + GTEST_DISALLOW_ASSIGN_(NativeArray); +}; + +} // namespace internal +} // namespace testing + +#define GTEST_MESSAGE_AT_(file, line, message, result_type) \ + ::testing::internal::AssertHelper(result_type, file, line, message) \ + = ::testing::Message() + +#define GTEST_MESSAGE_(message, result_type) \ + GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type) + +#define GTEST_FATAL_FAILURE_(message) \ + return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure) + +#define GTEST_NONFATAL_FAILURE_(message) \ + GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure) + +#define GTEST_SUCCESS_(message) \ + GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess) + +// Suppresses MSVC warnings 4072 (unreachable code) for the code following +// statement if it returns or throws (or doesn't return or throw in some +// situations). +#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \ + if (::testing::internal::AlwaysTrue()) { statement; } + +#define GTEST_TEST_THROW_(statement, expected_exception, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::ConstCharPtr gtest_msg = "") { \ + bool gtest_caught_expected = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + catch (expected_exception const&) { \ + gtest_caught_expected = true; \ + } \ + catch (...) { \ + gtest_msg.value = \ + "Expected: " #statement " throws an exception of type " \ + #expected_exception ".\n Actual: it throws a different type."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + if (!gtest_caught_expected) { \ + gtest_msg.value = \ + "Expected: " #statement " throws an exception of type " \ + #expected_exception ".\n Actual: it throws nothing."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \ + fail(gtest_msg.value) + +#define GTEST_TEST_NO_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + catch (...) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \ + fail("Expected: " #statement " doesn't throw an exception.\n" \ + " Actual: it throws.") + +#define GTEST_TEST_ANY_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + bool gtest_caught_any = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + catch (...) { \ + gtest_caught_any = true; \ + } \ + if (!gtest_caught_any) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \ + fail("Expected: " #statement " throws an exception.\n" \ + " Actual: it doesn't.") + + +// Implements Boolean test assertions such as EXPECT_TRUE. expression can be +// either a boolean expression or an AssertionResult. text is a textual +// represenation of expression as it was passed into the EXPECT_TRUE. 
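// [Editor's note: illustrative sketch, not part of the upstream patch.]
// The GTEST_TEST_THROW_/NO_THROW_/ANY_THROW_ macros above are the plumbing
// behind the public exception assertions, which in a test body read as:
//
//   EXPECT_THROW(std::vector<int>().at(1), std::out_of_range);
//   EXPECT_NO_THROW(std::vector<int>().clear());
//   EXPECT_ANY_THROW(throw 42);
//
// Each expands to one of the macros above, with either the fatal or the
// non-fatal failure reporter passed as the 'fail' argument.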
+#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (const ::testing::AssertionResult gtest_ar_ = \ + ::testing::AssertionResult(expression)) \ + ; \ + else \ + fail(::testing::internal::GetBoolAssertionFailureMessage(\ + gtest_ar_, text, #actual, #expected).c_str()) + +#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \ + fail("Expected: " #statement " doesn't generate new fatal " \ + "failures in the current thread.\n" \ + " Actual: it does.") + +// Expands to the name of the class that implements the given test. +#define GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ + test_case_name##_##test_name##_Test + +// Helper macro for defining tests. +#define GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)\ +class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\ + public:\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\ + private:\ + virtual void TestBody();\ + static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\ + GTEST_DISALLOW_COPY_AND_ASSIGN_(\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\ +};\ +\ +::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\ + ::test_info_ =\ + ::testing::internal::MakeAndRegisterTestInfo(\ + #test_case_name, #test_name, NULL, NULL, \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), \ + (parent_id), \ + parent_class::SetUpTestCase, \ + parent_class::TearDownTestCase, \ + new ::testing::internal::TestFactoryImpl<\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\ +void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ + diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-linked_ptr.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-linked_ptr.h new file mode 100644 index 0000000000..3602942217 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-linked_ptr.h @@ -0,0 +1,243 @@ +// Copyright 2003 Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: Dan Egnor (egnor@google.com) +// +// A "smart" pointer type with reference tracking. Every pointer to a +// particular object is kept on a circular linked list. When the last pointer +// to an object is destroyed or reassigned, the object is deleted. +// +// Used properly, this deletes the object when the last reference goes away. +// There are several caveats: +// - Like all reference counting schemes, cycles lead to leaks. +// - Each smart pointer is actually two pointers (8 bytes instead of 4). +// - Every time a pointer is assigned, the entire list of pointers to that +// object is traversed. This class is therefore NOT SUITABLE when there +// will often be more than two or three pointers to a particular object. +// - References are only tracked as long as linked_ptr<> objects are copied. +// If a linked_ptr<> is converted to a raw pointer and back, BAD THINGS +// will happen (double deletion). +// +// A good use of this class is storing object references in STL containers. +// You can safely put linked_ptr<> in a vector<>. +// Other uses may not be as good. +// +// Note: If you use an incomplete type with linked_ptr<>, the class +// *containing* linked_ptr<> must have a constructor and destructor (even +// if they do nothing!). +// +// Bill Gibbons suggested we use something like this. +// +// Thread Safety: +// Unlike other linked_ptr implementations, in this implementation +// a linked_ptr object is thread-safe in the sense that: +// - it's safe to copy linked_ptr objects concurrently, +// - it's safe to copy *from* a linked_ptr and read its underlying +// raw pointer (e.g. via get()) concurrently, and +// - it's safe to write to two linked_ptrs that point to the same +// shared object concurrently. +// TODO(wan@google.com): rename this to safe_linked_ptr to avoid +// confusion with normal linked_ptr. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ + +#include +#include + +#include "gtest/internal/gtest-port.h" + +namespace testing { +namespace internal { + +// Protects copying of all linked_ptr objects. +GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_linked_ptr_mutex); + +// This is used internally by all instances of linked_ptr<>. It needs to be +// a non-template class because different types of linked_ptr<> can refer to +// the same object (linked_ptr(obj) vs linked_ptr(obj)). +// So, it needs to be possible for different types of linked_ptr to participate +// in the same circular linked list, so we need a single class type here. +// +// DO NOT USE THIS CLASS DIRECTLY YOURSELF. Use linked_ptr. +class linked_ptr_internal { + public: + // Create a new circle that includes only this instance. + void join_new() { + next_ = this; + } + + // Many linked_ptr operations may change p.link_ for some linked_ptr + // variable p in the same circle as this object. Therefore we need + // to prevent two such operations from occurring concurrently. 
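// [Editor's note: illustrative sketch, not part of the upstream patch.]
// Typical linked_ptr usage, per the class comment above, is holding
// heap-allocated objects in an STL container ('Foo' here is hypothetical):
//
//   std::vector<linked_ptr<Foo> > foos;
//   foos.push_back(linked_ptr<Foo>(new Foo));   // the container owns it now
//   foos.push_back(foos[0]);                    // both entries share one Foo
//   // The Foo is deleted when the last linked_ptr referring to it goes away.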
+ // + // Note that different types of linked_ptr objects can coexist in a + // circle (e.g. linked_ptr, linked_ptr, and + // linked_ptr). Therefore we must use a single mutex to + // protect all linked_ptr objects. This can create serious + // contention in production code, but is acceptable in a testing + // framework. + + // Join an existing circle. + void join(linked_ptr_internal const* ptr) + GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) { + MutexLock lock(&g_linked_ptr_mutex); + + linked_ptr_internal const* p = ptr; + while (p->next_ != ptr) { + assert(p->next_ != this && + "Trying to join() a linked ring we are already in. " + "Is GMock thread safety enabled?"); + p = p->next_; + } + p->next_ = this; + next_ = ptr; + } + + // Leave whatever circle we're part of. Returns true if we were the + // last member of the circle. Once this is done, you can join() another. + bool depart() + GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) { + MutexLock lock(&g_linked_ptr_mutex); + + if (next_ == this) return true; + linked_ptr_internal const* p = next_; + while (p->next_ != this) { + assert(p->next_ != next_ && + "Trying to depart() a linked ring we are not in. " + "Is GMock thread safety enabled?"); + p = p->next_; + } + p->next_ = next_; + return false; + } + + private: + mutable linked_ptr_internal const* next_; +}; + +template +class linked_ptr { + public: + typedef T element_type; + + // Take over ownership of a raw pointer. This should happen as soon as + // possible after the object is created. + explicit linked_ptr(T* ptr = NULL) { capture(ptr); } + ~linked_ptr() { depart(); } + + // Copy an existing linked_ptr<>, adding ourselves to the list of references. + template linked_ptr(linked_ptr const& ptr) { copy(&ptr); } + linked_ptr(linked_ptr const& ptr) { // NOLINT + assert(&ptr != this); + copy(&ptr); + } + + // Assignment releases the old value and acquires the new. + template linked_ptr& operator=(linked_ptr const& ptr) { + depart(); + copy(&ptr); + return *this; + } + + linked_ptr& operator=(linked_ptr const& ptr) { + if (&ptr != this) { + depart(); + copy(&ptr); + } + return *this; + } + + // Smart pointer members. + void reset(T* ptr = NULL) { + depart(); + capture(ptr); + } + T* get() const { return value_; } + T* operator->() const { return value_; } + T& operator*() const { return *value_; } + + bool operator==(T* p) const { return value_ == p; } + bool operator!=(T* p) const { return value_ != p; } + template + bool operator==(linked_ptr const& ptr) const { + return value_ == ptr.get(); + } + template + bool operator!=(linked_ptr const& ptr) const { + return value_ != ptr.get(); + } + + private: + template + friend class linked_ptr; + + T* value_; + linked_ptr_internal link_; + + void depart() { + if (link_.depart()) delete value_; + } + + void capture(T* ptr) { + value_ = ptr; + link_.join_new(); + } + + template void copy(linked_ptr const* ptr) { + value_ = ptr->get(); + if (value_) + link_.join(&ptr->link_); + else + link_.join_new(); + } +}; + +template inline +bool operator==(T* ptr, const linked_ptr& x) { + return ptr == x.get(); +} + +template inline +bool operator!=(T* ptr, const linked_ptr& x) { + return ptr != x.get(); +} + +// A function to convert T* into linked_ptr +// Doing e.g. 
make_linked_ptr(new FooBarBaz(arg)) is a shorter notation +// for linked_ptr >(new FooBarBaz(arg)) +template +linked_ptr make_linked_ptr(T* ptr) { + return linked_ptr(ptr); +} + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h new file mode 100644 index 0000000000..4d1d81d20f --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h @@ -0,0 +1,5146 @@ +// This file was GENERATED by command: +// pump.py gtest-param-util-generated.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: vladl@google.com (Vlad Losev) + +// Type and function utilities for implementing parameterized tests. +// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +// Currently Google Test supports at most 50 arguments in Values, +// and at most 10 arguments in Combine. Please contact +// googletestframework@googlegroups.com if you need more. +// Please note that the number of arguments to Combine is limited +// by the maximum arity of the implementation of tuple which is +// currently set at 10. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ + +// scripts/fuse_gtest.py depends on gtest's own header being #included +// *unconditionally*. Therefore these #includes cannot be moved +// inside #if GTEST_HAS_PARAM_TEST. +#include "gtest/internal/gtest-param-util.h" +#include "gtest/internal/gtest-port.h" + +#if GTEST_HAS_PARAM_TEST + +namespace testing { + +// Forward declarations of ValuesIn(), which is implemented in +// include/gtest/gtest-param-test.h. 
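// [Editor's note: illustrative sketch, not part of the upstream patch.]
// ValuesIn() builds a parameter generator from an array or container, e.g.
// (the test case name 'SpeedTest' and the values are hypothetical):
//
//   static const int kCpuSpeeds[] = {0, 2, 4};
//   INSTANTIATE_TEST_CASE_P(Speeds, SpeedTest,
//                           ::testing::ValuesIn(kCpuSpeeds));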
+template +internal::ParamGenerator< + typename ::testing::internal::IteratorTraits::value_type> +ValuesIn(ForwardIterator begin, ForwardIterator end); + +template +internal::ParamGenerator ValuesIn(const T (&array)[N]); + +template +internal::ParamGenerator ValuesIn( + const Container& container); + +namespace internal { + +// Used in the Values() function to provide polymorphic capabilities. +template +class ValueArray1 { + public: + explicit ValueArray1(T1 v1) : v1_(v1) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray1& other); + + const T1 v1_; +}; + +template +class ValueArray2 { + public: + ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray2& other); + + const T1 v1_; + const T2 v2_; +}; + +template +class ValueArray3 { + public: + ValueArray3(T1 v1, T2 v2, T3 v3) : v1_(v1), v2_(v2), v3_(v3) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray3& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; +}; + +template +class ValueArray4 { + public: + ValueArray4(T1 v1, T2 v2, T3 v3, T4 v4) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray4& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; +}; + +template +class ValueArray5 { + public: + ValueArray5(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray5& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; +}; + +template +class ValueArray6 { + public: + ValueArray6(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
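// [Editor's note: illustrative sketch, not part of the upstream patch.]
// User code never names these ValueArrayN classes directly; Values() returns
// one, and the conversion operator above turns it into a ParamGenerator<T>
// ('ProfileTest' here is hypothetical):
//
//   INSTANTIATE_TEST_CASE_P(Profiles, ProfileTest,
//                           ::testing::Values(0, 1, 2));
//   // Values(0, 1, 2) yields a ValueArray3<int, int, int>, which is
//   // converted to ParamGenerator<int> via ValuesIn() on a temporary array.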
+ void operator=(const ValueArray6& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; +}; + +template +class ValueArray7 { + public: + ValueArray7(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray7& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; +}; + +template +class ValueArray8 { + public: + ValueArray8(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray8& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; +}; + +template +class ValueArray9 { + public: + ValueArray9(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray9& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; +}; + +template +class ValueArray10 { + public: + ValueArray10(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray10& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; +}; + +template +class ValueArray11 { + public: + ValueArray11(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray11& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; +}; + +template +class ValueArray12 { + public: + ValueArray12(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray12& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; +}; + +template +class ValueArray13 { + public: + ValueArray13(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray13& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; +}; + +template +class ValueArray14 { + public: + ValueArray14(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray14& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; +}; + +template +class ValueArray15 { + public: + ValueArray15(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray15& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; +}; + +template +class ValueArray16 { + public: + ValueArray16(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray16& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; +}; + +template +class ValueArray17 { + public: + ValueArray17(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, + T17 v17) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray17& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; +}; + +template +class ValueArray18 { + public: + ValueArray18(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray18& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; +}; + +template +class ValueArray19 { + public: + ValueArray19(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray19& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; +}; + +template +class ValueArray20 { + public: + ValueArray20(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray20& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; +}; + +template +class ValueArray21 { + public: + ValueArray21(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray21& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; +}; + +template +class ValueArray22 { + public: + ValueArray22(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray22& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; +}; + +template +class ValueArray23 { + public: + ValueArray23(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray23& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; +}; + +template +class ValueArray24 { + public: + ValueArray24(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), + v22_(v22), v23_(v23), v24_(v24) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray24& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; +}; + +template +class ValueArray25 { + public: + ValueArray25(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, + T25 v25) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray25& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; +}; + +template +class ValueArray26 { + public: + ValueArray26(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray26& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; +}; + +template +class ValueArray27 { + public: + ValueArray27(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), + v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), + v26_(v26), v27_(v27) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray27& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; +}; + +template +class ValueArray28 { + public: + ValueArray28(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), + v25_(v25), v26_(v26), v27_(v27), v28_(v28) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_)}; + return ValuesIn(array); + } + + private: + // No 
implementation - assignment is unsupported. + void operator=(const ValueArray28& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; +}; + +template +class ValueArray29 { + public: + ValueArray29(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), + v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray29& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; +}; + +template +class ValueArray30 { + public: + ValueArray30(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray30& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; +}; + +template +class ValueArray31 { + public: + ValueArray31(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray31& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; +}; + +template +class ValueArray32 { + public: + ValueArray32(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), + v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), + v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray32& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; +}; + +template +class ValueArray33 { + public: + ValueArray33(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, + T33 v33) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray33& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; +}; + +template +class ValueArray34 { + public: + ValueArray34(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray34& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; +}; + +template +class ValueArray35 { + public: + ValueArray35(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), + v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), + v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), + v32_(v32), v33_(v33), v34_(v34), v35_(v35) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray35& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; +}; + +template +class ValueArray36 { + public: + ValueArray36(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), + v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), + v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray36& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; +}; + +template +class ValueArray37 { + public: + ValueArray37(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), + v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), + v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), + v36_(v36), v37_(v37) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray37& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; +}; + +template +class ValueArray38 { + public: + ValueArray38(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), + v35_(v35), v36_(v36), v37_(v37), v38_(v38) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray38& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; +}; + +template +class ValueArray39 { + public: + ValueArray39(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), + v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray39& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; +}; + +template +class ValueArray40 { + public: + ValueArray40(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), + v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), + v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), + v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), + v40_(v40) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray40& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; +}; + +template +class ValueArray41 { + public: + ValueArray41(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, + T41 v41) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), + v39_(v39), v40_(v40), v41_(v41) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray41& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; +}; + +template +class ValueArray42 { + public: + ValueArray42(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), + v39_(v39), v40_(v40), v41_(v41), v42_(v42) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_), + static_cast(v42_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray42& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; +}; + +template +class ValueArray43 { + public: + ValueArray43(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), + v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), + v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), + v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), + v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_), + static_cast(v42_), static_cast(v43_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray43& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; +}; + +template +class ValueArray44 { + public: + ValueArray44(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), + v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), + v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), + v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), + v43_(v43), v44_(v44) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_), + static_cast(v42_), static_cast(v43_), static_cast(v44_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray44& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; +}; + +template +class ValueArray45 { + public: + ValueArray45(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), + v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), + v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), + v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), + v42_(v42), v43_(v43), v44_(v44), v45_(v45) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_), + static_cast(v42_), static_cast(v43_), static_cast(v44_), + static_cast(v45_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray45& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; +}; + +template +class ValueArray46 { + public: + ValueArray46(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), + v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), + v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_), + static_cast(v42_), static_cast(v43_), static_cast(v44_), + static_cast(v45_), static_cast(v46_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray46& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; +}; + +template +class ValueArray47 { + public: + ValueArray47(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), + v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), + v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46), + v47_(v47) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_), + static_cast(v42_), static_cast(v43_), static_cast(v44_), + static_cast(v45_), static_cast(v46_), static_cast(v47_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray47& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; + const T47 v47_; +}; + +template +class ValueArray48 { + public: + ValueArray48(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), + v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), + v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), + v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), + v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), + v46_(v46), v47_(v47), v48_(v48) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_), + static_cast(v42_), static_cast(v43_), static_cast(v44_), + static_cast(v45_), static_cast(v46_), static_cast(v47_), + static_cast(v48_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray48& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; + const T47 v47_; + const T48 v48_; +}; + +template +class ValueArray49 { + public: + ValueArray49(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, + T49 v49) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), + v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), + v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_), + static_cast(v42_), static_cast(v43_), static_cast(v44_), + static_cast(v45_), static_cast(v46_), static_cast(v47_), + static_cast(v48_), static_cast(v49_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray49& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; + const T47 v47_; + const T48 v48_; + const T49 v49_; +}; + +template +class ValueArray50 { + public: + ValueArray50(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, T49 v49, + T50 v50) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), + v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), + v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49), v50_(v50) {} + + template + operator ParamGenerator() const { + const T array[] = {static_cast(v1_), static_cast(v2_), + static_cast(v3_), static_cast(v4_), static_cast(v5_), + static_cast(v6_), static_cast(v7_), static_cast(v8_), + static_cast(v9_), static_cast(v10_), static_cast(v11_), + static_cast(v12_), static_cast(v13_), static_cast(v14_), + static_cast(v15_), static_cast(v16_), static_cast(v17_), + static_cast(v18_), static_cast(v19_), static_cast(v20_), + static_cast(v21_), static_cast(v22_), static_cast(v23_), + static_cast(v24_), static_cast(v25_), static_cast(v26_), + static_cast(v27_), static_cast(v28_), static_cast(v29_), + static_cast(v30_), static_cast(v31_), static_cast(v32_), + static_cast(v33_), static_cast(v34_), static_cast(v35_), + static_cast(v36_), static_cast(v37_), static_cast(v38_), + static_cast(v39_), static_cast(v40_), static_cast(v41_), + static_cast(v42_), static_cast(v43_), static_cast(v44_), + static_cast(v45_), static_cast(v46_), static_cast(v47_), + static_cast(v48_), static_cast(v49_), static_cast(v50_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray50& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; + const T47 v47_; + const T48 v48_; + const T49 v49_; + const T50 v50_; +}; + +# if GTEST_HAS_COMBINE +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Generates values from the Cartesian product of values produced +// by the argument generators. +// +template +class CartesianProductGenerator2 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator2(const ParamGenerator& g1, + const ParamGenerator& g2) + : g1_(g1), g2_(g2) {} + virtual ~CartesianProductGenerator2() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current2_; + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+          begin1_(other.begin1_),
+          end1_(other.end1_),
+          current1_(other.current1_),
+          begin2_(other.begin2_),
+          end2_(other.end2_),
+          current2_(other.current2_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_);
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator2::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator2& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+};  // class CartesianProductGenerator2
+
+
+template <typename T1, typename T2, typename T3>
+class CartesianProductGenerator3
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3> ParamType;
+
+  CartesianProductGenerator3(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3)
+      : g1_(g1), g2_(g2), g3_(g3) {}
+  virtual ~CartesianProductGenerator3() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3) {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+ virtual void Advance() { + assert(!AtEnd()); + ++current3_; + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + ParamType current_value_; + }; // class CartesianProductGenerator3::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator3& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; +}; // class CartesianProductGenerator3 + + +template +class CartesianProductGenerator4 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator4(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} + virtual ~CartesianProductGenerator4() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current4_; + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + ParamType current_value_; + }; // class CartesianProductGenerator4::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator4& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; +}; // class CartesianProductGenerator4 + + +template +class CartesianProductGenerator5 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator5(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} + virtual ~CartesianProductGenerator5() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current5_; + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + ParamType current_value_; + }; // class CartesianProductGenerator5::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator5& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; +}; // class CartesianProductGenerator5 + + +template +class CartesianProductGenerator6 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator6(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} + virtual ~CartesianProductGenerator6() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current6_; + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + ParamType current_value_; + }; // class CartesianProductGenerator6::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator6& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; +}; // class CartesianProductGenerator6 + + +template +class CartesianProductGenerator7 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator7(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6, const ParamGenerator& g7) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} + virtual ~CartesianProductGenerator7() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6, + const ParamGenerator& g7, + const typename ParamGenerator::iterator& current7) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current7_; + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." 
<< std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + const typename ParamGenerator::iterator begin7_; + const typename ParamGenerator::iterator end7_; + typename ParamGenerator::iterator current7_; + ParamType current_value_; + }; // class CartesianProductGenerator7::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator7& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; + const ParamGenerator g7_; +}; // class CartesianProductGenerator7 + + +template +class CartesianProductGenerator8 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator8(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6, const ParamGenerator& g7, + const ParamGenerator& g8) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), + g8_(g8) {} + virtual ~CartesianProductGenerator8() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin(), g8_, g8_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, + g8_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6, + const ParamGenerator& g7, + const typename ParamGenerator::iterator& current7, + const ParamGenerator& g8, + const typename ParamGenerator::iterator& current8) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7), + begin8_(g8.begin()), end8_(g8.end()), current8_(current8) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. 
+ virtual void Advance() { + assert(!AtEnd()); + ++current8_; + if (current8_ == end8_) { + current8_ = begin8_; + ++current7_; + } + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_ && + current8_ == typed_other->current8_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_), + begin8_(other.begin8_), + end8_(other.end8_), + current8_(other.current8_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_, *current8_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_ || + current8_ == end8_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. 
+ const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + const typename ParamGenerator::iterator begin7_; + const typename ParamGenerator::iterator end7_; + typename ParamGenerator::iterator current7_; + const typename ParamGenerator::iterator begin8_; + const typename ParamGenerator::iterator end8_; + typename ParamGenerator::iterator current8_; + ParamType current_value_; + }; // class CartesianProductGenerator8::Iterator + + // No implementation - assignment is unsupported. + void operator=(const CartesianProductGenerator8& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; + const ParamGenerator g7_; + const ParamGenerator g8_; +}; // class CartesianProductGenerator8 + + +template +class CartesianProductGenerator9 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator9(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6, const ParamGenerator& g7, + const ParamGenerator& g8, const ParamGenerator& g9) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9) {} + virtual ~CartesianProductGenerator9() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, + g8_.end(), g9_, g9_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6, + const ParamGenerator& g7, + const typename ParamGenerator::iterator& current7, + const ParamGenerator& g8, + const typename ParamGenerator::iterator& current8, + const ParamGenerator& g9, + const typename ParamGenerator::iterator& current9) + : base_(base), + 
begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7), + begin8_(g8.begin()), end8_(g8.end()), current8_(current8), + begin9_(g9.begin()), end9_(g9.end()), current9_(current9) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current9_; + if (current9_ == end9_) { + current9_ = begin9_; + ++current8_; + } + if (current8_ == end8_) { + current8_ = begin8_; + ++current7_; + } + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_ && + current8_ == typed_other->current8_ && + current9_ == typed_other->current9_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_), + begin8_(other.begin8_), + end8_(other.end8_), + current8_(other.current8_), + begin9_(other.begin9_), + end9_(other.end9_), + current9_(other.current9_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_, *current8_, + *current9_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_ || + current8_ == end8_ || + current9_ == end9_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + const typename ParamGenerator::iterator begin7_; + const typename ParamGenerator::iterator end7_; + typename ParamGenerator::iterator current7_; + const typename ParamGenerator::iterator begin8_; + const typename ParamGenerator::iterator end8_; + typename ParamGenerator::iterator current8_; + const typename ParamGenerator::iterator begin9_; + const typename ParamGenerator::iterator end9_; + typename ParamGenerator::iterator current9_; + ParamType current_value_; + }; // class CartesianProductGenerator9::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator9& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; + const ParamGenerator g7_; + const ParamGenerator g8_; + const ParamGenerator g9_; +}; // class CartesianProductGenerator9 + + +template +class CartesianProductGenerator10 + : public ParamGeneratorInterface< ::testing::tuple > { + public: + typedef ::testing::tuple ParamType; + + CartesianProductGenerator10(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6, const ParamGenerator& g7, + const ParamGenerator& g8, const ParamGenerator& g9, + const ParamGenerator& g10) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9), g10_(g10) {} + virtual ~CartesianProductGenerator10() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin(), g10_, g10_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, + g8_.end(), g9_, g9_.end(), g10_, g10_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6, + const ParamGenerator& g7, + const typename ParamGenerator::iterator& current7, + const ParamGenerator& g8, + const typename ParamGenerator::iterator& current8, + const ParamGenerator& g9, + const typename ParamGenerator::iterator& current9, + const ParamGenerator& g10, + const typename ParamGenerator::iterator& current10) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7), + begin8_(g8.begin()), end8_(g8.end()), current8_(current8), + begin9_(g9.begin()), end9_(g9.end()), current9_(current9), + begin10_(g10.begin()), end10_(g10.end()), current10_(current10) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. 
+ virtual void Advance() { + assert(!AtEnd()); + ++current10_; + if (current10_ == end10_) { + current10_ = begin10_; + ++current9_; + } + if (current9_ == end9_) { + current9_ = begin9_; + ++current8_; + } + if (current8_ == end8_) { + current8_ = begin8_; + ++current7_; + } + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_ && + current8_ == typed_other->current8_ && + current9_ == typed_other->current9_ && + current10_ == typed_other->current10_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_), + begin8_(other.begin8_), + end8_(other.end8_), + current8_(other.current8_), + begin9_(other.begin9_), + end9_(other.end9_), + current9_(other.current9_), + begin10_(other.begin10_), + end10_(other.end10_), + current10_(other.current10_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_, *current8_, + *current9_, *current10_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_ || + current8_ == end8_ || + current9_ == end9_ || + current10_ == end10_; + } + + // No implementation - assignment is unsupported. 
+ void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + const typename ParamGenerator::iterator begin7_; + const typename ParamGenerator::iterator end7_; + typename ParamGenerator::iterator current7_; + const typename ParamGenerator::iterator begin8_; + const typename ParamGenerator::iterator end8_; + typename ParamGenerator::iterator current8_; + const typename ParamGenerator::iterator begin9_; + const typename ParamGenerator::iterator end9_; + typename ParamGenerator::iterator current9_; + const typename ParamGenerator::iterator begin10_; + const typename ParamGenerator::iterator end10_; + typename ParamGenerator::iterator current10_; + ParamType current_value_; + }; // class CartesianProductGenerator10::Iterator + + // No implementation - assignment is unsupported. + void operator=(const CartesianProductGenerator10& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; + const ParamGenerator g7_; + const ParamGenerator g8_; + const ParamGenerator g9_; + const ParamGenerator g10_; +}; // class CartesianProductGenerator10 + + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Helper classes providing Combine() with polymorphic features. They allow +// casting CartesianProductGeneratorN to ParamGenerator if T is +// convertible to U. +// +template +class CartesianProductHolder2 { + public: +CartesianProductHolder2(const Generator1& g1, const Generator2& g2) + : g1_(g1), g2_(g2) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator2( + static_cast >(g1_), + static_cast >(g2_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder2& other); + + const Generator1 g1_; + const Generator2 g2_; +}; // class CartesianProductHolder2 + +template +class CartesianProductHolder3 { + public: +CartesianProductHolder3(const Generator1& g1, const Generator2& g2, + const Generator3& g3) + : g1_(g1), g2_(g2), g3_(g3) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator3( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_))); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductHolder3& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; +}; // class CartesianProductHolder3 + +template +class CartesianProductHolder4 { + public: +CartesianProductHolder4(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator4( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder4& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; +}; // class CartesianProductHolder4 + +template +class CartesianProductHolder5 { + public: +CartesianProductHolder5(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator5( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder5& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; +}; // class CartesianProductHolder5 + +template +class CartesianProductHolder6 { + public: +CartesianProductHolder6(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator6( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder6& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; +}; // class CartesianProductHolder6 + +template +class CartesianProductHolder7 { + public: +CartesianProductHolder7(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator7( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_), + static_cast >(g7_))); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductHolder7& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; +}; // class CartesianProductHolder7 + +template +class CartesianProductHolder8 { + public: +CartesianProductHolder8(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7, const Generator8& g8) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), + g8_(g8) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator8( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_), + static_cast >(g7_), + static_cast >(g8_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder8& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; + const Generator8 g8_; +}; // class CartesianProductHolder8 + +template +class CartesianProductHolder9 { + public: +CartesianProductHolder9(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7, const Generator8& g8, + const Generator9& g9) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator9( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_), + static_cast >(g7_), + static_cast >(g8_), + static_cast >(g9_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder9& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; + const Generator8 g8_; + const Generator9 g9_; +}; // class CartesianProductHolder9 + +template +class CartesianProductHolder10 { + public: +CartesianProductHolder10(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7, const Generator8& g8, + const Generator9& g9, const Generator10& g10) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9), g10_(g10) {} + template + operator ParamGenerator< ::testing::tuple >() const { + return ParamGenerator< ::testing::tuple >( + new CartesianProductGenerator10( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_), + static_cast >(g7_), + static_cast >(g8_), + static_cast >(g9_), + static_cast >(g10_))); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductHolder10& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; + const Generator8 g8_; + const Generator9 g9_; + const Generator10 g10_; +}; // class CartesianProductHolder10 + +# endif // GTEST_HAS_COMBINE + +} // namespace internal +} // namespace testing + +#endif // GTEST_HAS_PARAM_TEST + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h.pump b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h.pump new file mode 100644 index 0000000000..5c7c47af0b --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h.pump @@ -0,0 +1,286 @@ +$$ -*- mode: c++; -*- +$var n = 50 $$ Maximum length of Values arguments we want to support. +$var maxtuple = 10 $$ Maximum number of Combine arguments we want to support. +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: vladl@google.com (Vlad Losev) + +// Type and function utilities for implementing parameterized tests. +// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +// Currently Google Test supports at most $n arguments in Values, +// and at most $maxtuple arguments in Combine. Please contact +// googletestframework@googlegroups.com if you need more. +// Please note that the number of arguments to Combine is limited +// by the maximum arity of the implementation of tuple which is +// currently set at $maxtuple. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ + +// scripts/fuse_gtest.py depends on gtest's own header being #included +// *unconditionally*. Therefore these #includes cannot be moved +// inside #if GTEST_HAS_PARAM_TEST. 
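The CartesianProductHolderN classes generated above exist so that ::testing::Combine() converts implicitly into a ParamGenerator over ::testing::tuple. A minimal usage sketch, not part of the patch, assuming the public gtest 1.8 API as vendored here; the fixture name and values are hypothetical, and ::testing::get is assumed to be the tuple accessor this gtest provides.

#include "gtest/gtest.h"

// Hypothetical value-parameterized test over (width, height) pairs.
class BlockSizeTest
    : public ::testing::TestWithParam< ::testing::tuple<int, int> > {};

TEST_P(BlockSizeTest, AreaIsPositive) {
  const int width = ::testing::get<0>(GetParam());
  const int height = ::testing::get<1>(GetParam());
  EXPECT_GT(width * height, 0);
}

// Combine() returns a CartesianProductHolder2, which converts to
// ParamGenerator< ::testing::tuple<int, int> > and enumerates all nine pairs.
INSTANTIATE_TEST_CASE_P(Sizes, BlockSizeTest,
                        ::testing::Combine(::testing::Values(4, 8, 16),
                                           ::testing::Values(4, 8, 16)));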
+#include "gtest/internal/gtest-param-util.h" +#include "gtest/internal/gtest-port.h" + +#if GTEST_HAS_PARAM_TEST + +namespace testing { + +// Forward declarations of ValuesIn(), which is implemented in +// include/gtest/gtest-param-test.h. +template +internal::ParamGenerator< + typename ::testing::internal::IteratorTraits::value_type> +ValuesIn(ForwardIterator begin, ForwardIterator end); + +template +internal::ParamGenerator ValuesIn(const T (&array)[N]); + +template +internal::ParamGenerator ValuesIn( + const Container& container); + +namespace internal { + +// Used in the Values() function to provide polymorphic capabilities. +$range i 1..n +$for i [[ +$range j 1..i + +template <$for j, [[typename T$j]]> +class ValueArray$i { + public: + $if i==1 [[explicit ]]ValueArray$i($for j, [[T$j v$j]]) : $for j, [[v$(j)_(v$j)]] {} + + template + operator ParamGenerator() const { + const T array[] = {$for j, [[static_cast(v$(j)_)]]}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray$i& other); + +$for j [[ + + const T$j v$(j)_; +]] + +}; + +]] + +# if GTEST_HAS_COMBINE +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Generates values from the Cartesian product of values produced +// by the argument generators. +// +$range i 2..maxtuple +$for i [[ +$range j 1..i +$range k 2..i + +template <$for j, [[typename T$j]]> +class CartesianProductGenerator$i + : public ParamGeneratorInterface< ::testing::tuple<$for j, [[T$j]]> > { + public: + typedef ::testing::tuple<$for j, [[T$j]]> ParamType; + + CartesianProductGenerator$i($for j, [[const ParamGenerator& g$j]]) + : $for j, [[g$(j)_(g$j)]] {} + virtual ~CartesianProductGenerator$i() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, $for j, [[g$(j)_, g$(j)_.begin()]]); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, $for j, [[g$(j)_, g$(j)_.end()]]); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, $for j, [[ + + const ParamGenerator& g$j, + const typename ParamGenerator::iterator& current$(j)]]) + : base_(base), +$for j, [[ + + begin$(j)_(g$j.begin()), end$(j)_(g$j.end()), current$(j)_(current$j) +]] { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current$(i)_; + +$for k [[ + if (current$(i+2-k)_ == end$(i+2-k)_) { + current$(i+2-k)_ = begin$(i+2-k)_; + ++current$(i+2-k-1)_; + } + +]] + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ($for j && [[ + + current$(j)_ == typed_other->current$(j)_ +]]); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), $for j, [[ + + begin$(j)_(other.begin$(j)_), + end$(j)_(other.end$(j)_), + current$(j)_(other.current$(j)_) +]] { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType($for j, [[*current$(j)_]]); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return +$for j || [[ + + current$(j)_ == end$(j)_ +]]; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. +$for j [[ + + const typename ParamGenerator::iterator begin$(j)_; + const typename ParamGenerator::iterator end$(j)_; + typename ParamGenerator::iterator current$(j)_; +]] + + ParamType current_value_; + }; // class CartesianProductGenerator$i::Iterator + + // No implementation - assignment is unsupported. + void operator=(const CartesianProductGenerator$i& other); + + +$for j [[ + const ParamGenerator g$(j)_; + +]] +}; // class CartesianProductGenerator$i + + +]] + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Helper classes providing Combine() with polymorphic features. They allow +// casting CartesianProductGeneratorN to ParamGenerator if T is +// convertible to U. +// +$range i 2..maxtuple +$for i [[ +$range j 1..i + +template <$for j, [[class Generator$j]]> +class CartesianProductHolder$i { + public: +CartesianProductHolder$i($for j, [[const Generator$j& g$j]]) + : $for j, [[g$(j)_(g$j)]] {} + template <$for j, [[typename T$j]]> + operator ParamGenerator< ::testing::tuple<$for j, [[T$j]]> >() const { + return ParamGenerator< ::testing::tuple<$for j, [[T$j]]> >( + new CartesianProductGenerator$i<$for j, [[T$j]]>( +$for j,[[ + + static_cast >(g$(j)_) +]])); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder$i& other); + + +$for j [[ + const Generator$j g$(j)_; + +]] +}; // class CartesianProductHolder$i + +]] + +# endif // GTEST_HAS_COMBINE + +} // namespace internal +} // namespace testing + +#endif // GTEST_HAS_PARAM_TEST + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h new file mode 100644 index 0000000000..82cab9b020 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h @@ -0,0 +1,731 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: vladl@google.com (Vlad Losev) + +// Type and function utilities for implementing parameterized tests. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ + +#include + +#include +#include +#include +#include + +// scripts/fuse_gtest.py depends on gtest's own header being #included +// *unconditionally*. Therefore these #includes cannot be moved +// inside #if GTEST_HAS_PARAM_TEST. +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-linked_ptr.h" +#include "gtest/internal/gtest-port.h" +#include "gtest/gtest-printers.h" + +#if GTEST_HAS_PARAM_TEST + +namespace testing { + +// Input to a parameterized test name generator, describing a test parameter. +// Consists of the parameter value and the integer parameter index. +template +struct TestParamInfo { + TestParamInfo(const ParamType& a_param, size_t an_index) : + param(a_param), + index(an_index) {} + ParamType param; + size_t index; +}; + +// A builtin parameterized test name generator which returns the result of +// testing::PrintToString. +struct PrintToStringParamName { + template + std::string operator()(const TestParamInfo& info) const { + return PrintToString(info.param); + } +}; + +namespace internal { + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Outputs a message explaining invalid registration of different +// fixture class for the same test case. This may happen when +// TEST_P macro is used to define two tests with the same name +// but in different namespaces. +GTEST_API_ void ReportInvalidTestCaseType(const char* test_case_name, + CodeLocation code_location); + +template class ParamGeneratorInterface; +template class ParamGenerator; + +// Interface for iterating over elements provided by an implementation +// of ParamGeneratorInterface. +template +class ParamIteratorInterface { + public: + virtual ~ParamIteratorInterface() {} + // A pointer to the base generator instance. + // Used only for the purposes of iterator comparison + // to make sure that two iterators belong to the same generator. + virtual const ParamGeneratorInterface* BaseGenerator() const = 0; + // Advances iterator to point to the next element + // provided by the generator. The caller is responsible + // for not calling Advance() on an iterator equal to + // BaseGenerator()->End(). + virtual void Advance() = 0; + // Clones the iterator object. Used for implementing copy semantics + // of ParamIterator. 
+ virtual ParamIteratorInterface* Clone() const = 0; + // Dereferences the current iterator and provides (read-only) access + // to the pointed value. It is the caller's responsibility not to call + // Current() on an iterator equal to BaseGenerator()->End(). + // Used for implementing ParamGenerator::operator*(). + virtual const T* Current() const = 0; + // Determines whether the given iterator and other point to the same + // element in the sequence generated by the generator. + // Used for implementing ParamGenerator::operator==(). + virtual bool Equals(const ParamIteratorInterface& other) const = 0; +}; + +// Class iterating over elements provided by an implementation of +// ParamGeneratorInterface. It wraps ParamIteratorInterface +// and implements the const forward iterator concept. +template +class ParamIterator { + public: + typedef T value_type; + typedef const T& reference; + typedef ptrdiff_t difference_type; + + // ParamIterator assumes ownership of the impl_ pointer. + ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {} + ParamIterator& operator=(const ParamIterator& other) { + if (this != &other) + impl_.reset(other.impl_->Clone()); + return *this; + } + + const T& operator*() const { return *impl_->Current(); } + const T* operator->() const { return impl_->Current(); } + // Prefix version of operator++. + ParamIterator& operator++() { + impl_->Advance(); + return *this; + } + // Postfix version of operator++. + ParamIterator operator++(int /*unused*/) { + ParamIteratorInterface* clone = impl_->Clone(); + impl_->Advance(); + return ParamIterator(clone); + } + bool operator==(const ParamIterator& other) const { + return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_); + } + bool operator!=(const ParamIterator& other) const { + return !(*this == other); + } + + private: + friend class ParamGenerator; + explicit ParamIterator(ParamIteratorInterface* impl) : impl_(impl) {} + scoped_ptr > impl_; +}; + +// ParamGeneratorInterface is the binary interface to access generators +// defined in other translation units. +template +class ParamGeneratorInterface { + public: + typedef T ParamType; + + virtual ~ParamGeneratorInterface() {} + + // Generator interface definition + virtual ParamIteratorInterface* Begin() const = 0; + virtual ParamIteratorInterface* End() const = 0; +}; + +// Wraps ParamGeneratorInterface and provides general generator syntax +// compatible with the STL Container concept. +// This class implements copy initialization semantics and the contained +// ParamGeneratorInterface instance is shared among all copies +// of the original object. This is possible because that instance is immutable. +template +class ParamGenerator { + public: + typedef ParamIterator iterator; + + explicit ParamGenerator(ParamGeneratorInterface* impl) : impl_(impl) {} + ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {} + + ParamGenerator& operator=(const ParamGenerator& other) { + impl_ = other.impl_; + return *this; + } + + iterator begin() const { return iterator(impl_->Begin()); } + iterator end() const { return iterator(impl_->End()); } + + private: + linked_ptr > impl_; +}; + +// Generates values from a range of two comparable values. Can be used to +// generate sequences of user-defined types that implement operator+() and +// operator<(). +// This class is used in the Range() function. 
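A rough illustration, not part of the patch, of the iterator protocol these wrappers implement: any generator expression (Range(), Values(), ValuesIn()) converts to a ParamGenerator<T>, which behaves like a read-only STL container. The internal types are spelled out here only to make the concept concrete; test code normally never names them directly.

#include "gtest/gtest.h"

int SumOfRangeParams() {
  // Range(0, 10, 2) produces 0, 2, 4, 6, 8; the end value is excluded.
  ::testing::internal::ParamGenerator<int> gen = ::testing::Range(0, 10, 2);
  int sum = 0;
  for (::testing::internal::ParamGenerator<int>::iterator it = gen.begin();
       it != gen.end(); ++it) {
    sum += *it;  // ParamIterator::operator*() returns the element held by Current().
  }
  return sum;  // 20
}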
+template +class RangeGenerator : public ParamGeneratorInterface { + public: + RangeGenerator(T begin, T end, IncrementT step) + : begin_(begin), end_(end), + step_(step), end_index_(CalculateEndIndex(begin, end, step)) {} + virtual ~RangeGenerator() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, begin_, 0, step_); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, end_, end_index_, step_); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, T value, int index, + IncrementT step) + : base_(base), value_(value), index_(index), step_(step) {} + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + virtual void Advance() { + value_ = static_cast(value_ + step_); + index_++; + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const T* Current() const { return &value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const int other_index = + CheckedDowncastToActualType(&other)->index_; + return index_ == other_index; + } + + private: + Iterator(const Iterator& other) + : ParamIteratorInterface(), + base_(other.base_), value_(other.value_), index_(other.index_), + step_(other.step_) {} + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + T value_; + int index_; + const IncrementT step_; + }; // class RangeGenerator::Iterator + + static int CalculateEndIndex(const T& begin, + const T& end, + const IncrementT& step) { + int end_index = 0; + for (T i = begin; i < end; i = static_cast(i + step)) + end_index++; + return end_index; + } + + // No implementation - assignment is unsupported. + void operator=(const RangeGenerator& other); + + const T begin_; + const T end_; + const IncrementT step_; + // The index for the end() iterator. All the elements in the generated + // sequence are indexed (0-based) to aid iterator comparison. + const int end_index_; +}; // class RangeGenerator + + +// Generates values from a pair of STL-style iterators. Used in the +// ValuesIn() function. The elements are copied from the source range +// since the source can be located on the stack, and the generator +// is likely to persist beyond that stack frame. 
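As the comment above notes, ValuesIn() copies its elements, so a generator built from a function-local array remains valid after the enclosing function returns. A small sketch, not part of the patch; the function name and values are hypothetical.

#include "gtest/gtest.h"

// The array lives on the stack, but its elements are copied into the
// generator's internal vector, so returning the generator by value is safe.
::testing::internal::ParamGenerator<int> MakeSpeedGenerator() {
  const int speeds[] = { 0, 2, 4, 6 };
  return ::testing::ValuesIn(speeds);
}

// Typically passed straight to INSTANTIATE_TEST_CASE_P(..., MakeSpeedGenerator());
// the macro captures the expression as a function and only evaluates it when
// the parameterized tests are registered.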
+template +class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface { + public: + template + ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end) + : container_(begin, end) {} + virtual ~ValuesInIteratorRangeGenerator() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, container_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, container_.end()); + } + + private: + typedef typename ::std::vector ContainerType; + + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + typename ContainerType::const_iterator iterator) + : base_(base), iterator_(iterator) {} + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + virtual void Advance() { + ++iterator_; + value_.reset(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + // We need to use cached value referenced by iterator_ because *iterator_ + // can return a temporary object (and of type other then T), so just + // having "return &*iterator_;" doesn't work. + // value_ is updated here and not in Advance() because Advance() + // can advance iterator_ beyond the end of the range, and we cannot + // detect that fact. The client code, on the other hand, is + // responsible for not calling Current() on an out-of-range iterator. + virtual const T* Current() const { + if (value_.get() == NULL) + value_.reset(new T(*iterator_)); + return value_.get(); + } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + return iterator_ == + CheckedDowncastToActualType(&other)->iterator_; + } + + private: + Iterator(const Iterator& other) + // The explicit constructor call suppresses a false warning + // emitted by gcc when supplied with the -Wextra option. + : ParamIteratorInterface(), + base_(other.base_), + iterator_(other.iterator_) {} + + const ParamGeneratorInterface* const base_; + typename ContainerType::const_iterator iterator_; + // A cached value of *iterator_. We keep it here to allow access by + // pointer in the wrapping iterator's operator->(). + // value_ needs to be mutable to be accessed in Current(). + // Use of scoped_ptr helps manage cached value's lifetime, + // which is bound by the lifespan of the iterator itself. + mutable scoped_ptr value_; + }; // class ValuesInIteratorRangeGenerator::Iterator + + // No implementation - assignment is unsupported. + void operator=(const ValuesInIteratorRangeGenerator& other); + + const ContainerType container_; +}; // class ValuesInIteratorRangeGenerator + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Default parameterized test name generator, returns a string containing the +// integer test parameter index. +template +std::string DefaultParamName(const TestParamInfo& info) { + Message name_stream; + name_stream << info.index; + return name_stream.GetString(); +} + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Parameterized test name overload helpers, which help the +// INSTANTIATE_TEST_CASE_P macro choose between the default parameterized +// test name generator and user param name generator. 
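A sketch, not part of the patch, of the two naming paths these helpers select between: with three arguments INSTANTIATE_TEST_CASE_P falls back to DefaultParamName() (the parameter index), and an optional fourth argument supplies a custom name generator. Fixture, test, and functor names below are hypothetical; generated names must be non-empty, unique within an instantiation, and contain only alphanumerics and underscores.

#include <string>
#include "gtest/gtest.h"

class SpeedTest : public ::testing::TestWithParam<int> {};
TEST_P(SpeedTest, Runs) { EXPECT_GE(GetParam(), 0); }

std::string SpeedName(const ::testing::TestParamInfo<int>& info) {
  return "Speed" + ::testing::PrintToString(info.param);
}

INSTANTIATE_TEST_CASE_P(ByIndex, SpeedTest, ::testing::Values(0, 2));
// -> ByIndex/SpeedTest.Runs/0, ByIndex/SpeedTest.Runs/1

INSTANTIATE_TEST_CASE_P(ByName, SpeedTest, ::testing::Values(0, 2), SpeedName);
// -> ByName/SpeedTest.Runs/Speed0, ByName/SpeedTest.Runs/Speed2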
+template +ParamNameGenFunctor GetParamNameGen(ParamNameGenFunctor func) { + return func; +} + +template +struct ParamNameGenFunc { + typedef std::string Type(const TestParamInfo&); +}; + +template +typename ParamNameGenFunc::Type *GetParamNameGen() { + return DefaultParamName; +} + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Stores a parameter value and later creates tests parameterized with that +// value. +template +class ParameterizedTestFactory : public TestFactoryBase { + public: + typedef typename TestClass::ParamType ParamType; + explicit ParameterizedTestFactory(ParamType parameter) : + parameter_(parameter) {} + virtual Test* CreateTest() { + TestClass::SetParam(¶meter_); + return new TestClass(); + } + + private: + const ParamType parameter_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory); +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// TestMetaFactoryBase is a base class for meta-factories that create +// test factories for passing into MakeAndRegisterTestInfo function. +template +class TestMetaFactoryBase { + public: + virtual ~TestMetaFactoryBase() {} + + virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0; +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// TestMetaFactory creates test factories for passing into +// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives +// ownership of test factory pointer, same factory object cannot be passed +// into that method twice. But ParameterizedTestCaseInfo is going to call +// it for each Test/Parameter value combination. Thus it needs meta factory +// creator class. +template +class TestMetaFactory + : public TestMetaFactoryBase { + public: + typedef typename TestCase::ParamType ParamType; + + TestMetaFactory() {} + + virtual TestFactoryBase* CreateTestFactory(ParamType parameter) { + return new ParameterizedTestFactory(parameter); + } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory); +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// ParameterizedTestCaseInfoBase is a generic interface +// to ParameterizedTestCaseInfo classes. ParameterizedTestCaseInfoBase +// accumulates test information provided by TEST_P macro invocations +// and generators provided by INSTANTIATE_TEST_CASE_P macro invocations +// and uses that information to register all resulting test instances +// in RegisterTests method. The ParameterizeTestCaseRegistry class holds +// a collection of pointers to the ParameterizedTestCaseInfo objects +// and calls RegisterTests() on each of them when asked. +class ParameterizedTestCaseInfoBase { + public: + virtual ~ParameterizedTestCaseInfoBase() {} + + // Base part of test case name for display purposes. + virtual const string& GetTestCaseName() const = 0; + // Test case id to verify identity. + virtual TypeId GetTestCaseTypeId() const = 0; + // UnitTest class invokes this method to register tests in this + // test case right before running them in RUN_ALL_TESTS macro. + // This method should not be called more then once on any single + // instance of a ParameterizedTestCaseInfoBase derived class. + virtual void RegisterTests() = 0; + + protected: + ParameterizedTestCaseInfoBase() {} + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfoBase); +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. 
+// +// ParameterizedTestCaseInfo accumulates tests obtained from TEST_P +// macro invocations for a particular test case and generators +// obtained from INSTANTIATE_TEST_CASE_P macro invocations for that +// test case. It registers tests with all values generated by all +// generators when asked. +template +class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { + public: + // ParamType and GeneratorCreationFunc are private types but are required + // for declarations of public methods AddTestPattern() and + // AddTestCaseInstantiation(). + typedef typename TestCase::ParamType ParamType; + // A function that returns an instance of appropriate generator type. + typedef ParamGenerator(GeneratorCreationFunc)(); + typedef typename ParamNameGenFunc::Type ParamNameGeneratorFunc; + + explicit ParameterizedTestCaseInfo( + const char* name, CodeLocation code_location) + : test_case_name_(name), code_location_(code_location) {} + + // Test case base name for display purposes. + virtual const string& GetTestCaseName() const { return test_case_name_; } + // Test case id to verify identity. + virtual TypeId GetTestCaseTypeId() const { return GetTypeId(); } + // TEST_P macro uses AddTestPattern() to record information + // about a single test in a LocalTestInfo structure. + // test_case_name is the base name of the test case (without invocation + // prefix). test_base_name is the name of an individual test without + // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is + // test case base name and DoBar is test base name. + void AddTestPattern(const char* test_case_name, + const char* test_base_name, + TestMetaFactoryBase* meta_factory) { + tests_.push_back(linked_ptr(new TestInfo(test_case_name, + test_base_name, + meta_factory))); + } + // INSTANTIATE_TEST_CASE_P macro uses AddGenerator() to record information + // about a generator. + int AddTestCaseInstantiation(const string& instantiation_name, + GeneratorCreationFunc* func, + ParamNameGeneratorFunc* name_func, + const char* file, + int line) { + instantiations_.push_back( + InstantiationInfo(instantiation_name, func, name_func, file, line)); + return 0; // Return value used only to run this method in namespace scope. + } + // UnitTest class invokes this method to register tests in this test case + // test cases right before running tests in RUN_ALL_TESTS macro. + // This method should not be called more then once on any single + // instance of a ParameterizedTestCaseInfoBase derived class. + // UnitTest has a guard to prevent from calling this method more then once. 
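A compact sketch, not part of the patch, of the flow this class implements, using the same FooTest / SequenceA names as the comment above (the test code itself is hypothetical): TEST_P records a pattern via AddTestPattern(), INSTANTIATE_TEST_CASE_P records a generator via AddTestCaseInstantiation(), and RegisterTests() below later crosses every pattern with every generated value.

#include "gtest/gtest.h"

class FooTest : public ::testing::TestWithParam<int> {};

TEST_P(FooTest, DoBar) {              // -> AddTestPattern("FooTest", "DoBar", ...)
  EXPECT_GE(GetParam(), 0);
}

INSTANTIATE_TEST_CASE_P(SequenceA,    // -> AddTestCaseInstantiation("SequenceA", ...)
                        FooTest, ::testing::Values(1, 2));

// When RUN_ALL_TESTS() triggers RegisterTests(), the registered tests are
// SequenceA/FooTest.DoBar/0 and SequenceA/FooTest.DoBar/1 (default parameter
// names are the zero-based indices).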
+ virtual void RegisterTests() { + for (typename TestInfoContainer::iterator test_it = tests_.begin(); + test_it != tests_.end(); ++test_it) { + linked_ptr test_info = *test_it; + for (typename InstantiationContainer::iterator gen_it = + instantiations_.begin(); gen_it != instantiations_.end(); + ++gen_it) { + const string& instantiation_name = gen_it->name; + ParamGenerator generator((*gen_it->generator)()); + ParamNameGeneratorFunc* name_func = gen_it->name_func; + const char* file = gen_it->file; + int line = gen_it->line; + + string test_case_name; + if ( !instantiation_name.empty() ) + test_case_name = instantiation_name + "/"; + test_case_name += test_info->test_case_base_name; + + size_t i = 0; + std::set test_param_names; + for (typename ParamGenerator::iterator param_it = + generator.begin(); + param_it != generator.end(); ++param_it, ++i) { + Message test_name_stream; + + std::string param_name = name_func( + TestParamInfo(*param_it, i)); + + GTEST_CHECK_(IsValidParamName(param_name)) + << "Parameterized test name '" << param_name + << "' is invalid, in " << file + << " line " << line << std::endl; + + GTEST_CHECK_(test_param_names.count(param_name) == 0) + << "Duplicate parameterized test name '" << param_name + << "', in " << file << " line " << line << std::endl; + + test_param_names.insert(param_name); + + test_name_stream << test_info->test_base_name << "/" << param_name; + MakeAndRegisterTestInfo( + test_case_name.c_str(), + test_name_stream.GetString().c_str(), + NULL, // No type parameter. + PrintToString(*param_it).c_str(), + code_location_, + GetTestCaseTypeId(), + TestCase::SetUpTestCase, + TestCase::TearDownTestCase, + test_info->test_meta_factory->CreateTestFactory(*param_it)); + } // for param_it + } // for gen_it + } // for test_it + } // RegisterTests + + private: + // LocalTestInfo structure keeps information about a single test registered + // with TEST_P macro. + struct TestInfo { + TestInfo(const char* a_test_case_base_name, + const char* a_test_base_name, + TestMetaFactoryBase* a_test_meta_factory) : + test_case_base_name(a_test_case_base_name), + test_base_name(a_test_base_name), + test_meta_factory(a_test_meta_factory) {} + + const string test_case_base_name; + const string test_base_name; + const scoped_ptr > test_meta_factory; + }; + typedef ::std::vector > TestInfoContainer; + // Records data received from INSTANTIATE_TEST_CASE_P macros: + // + struct InstantiationInfo { + InstantiationInfo(const std::string &name_in, + GeneratorCreationFunc* generator_in, + ParamNameGeneratorFunc* name_func_in, + const char* file_in, + int line_in) + : name(name_in), + generator(generator_in), + name_func(name_func_in), + file(file_in), + line(line_in) {} + + std::string name; + GeneratorCreationFunc* generator; + ParamNameGeneratorFunc* name_func; + const char* file; + int line; + }; + typedef ::std::vector InstantiationContainer; + + static bool IsValidParamName(const std::string& name) { + // Check for empty string + if (name.empty()) + return false; + + // Check for invalid characters + for (std::string::size_type index = 0; index < name.size(); ++index) { + if (!isalnum(name[index]) && name[index] != '_') + return false; + } + + return true; + } + + const string test_case_name_; + CodeLocation code_location_; + TestInfoContainer tests_; + InstantiationContainer instantiations_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfo); +}; // class ParameterizedTestCaseInfo + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. 
+// +// ParameterizedTestCaseRegistry contains a map of ParameterizedTestCaseInfoBase +// classes accessed by test case names. TEST_P and INSTANTIATE_TEST_CASE_P +// macros use it to locate their corresponding ParameterizedTestCaseInfo +// descriptors. +class ParameterizedTestCaseRegistry { + public: + ParameterizedTestCaseRegistry() {} + ~ParameterizedTestCaseRegistry() { + for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); + it != test_case_infos_.end(); ++it) { + delete *it; + } + } + + // Looks up or creates and returns a structure containing information about + // tests and instantiations of a particular test case. + template + ParameterizedTestCaseInfo* GetTestCasePatternHolder( + const char* test_case_name, + CodeLocation code_location) { + ParameterizedTestCaseInfo* typed_test_info = NULL; + for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); + it != test_case_infos_.end(); ++it) { + if ((*it)->GetTestCaseName() == test_case_name) { + if ((*it)->GetTestCaseTypeId() != GetTypeId()) { + // Complain about incorrect usage of Google Test facilities + // and terminate the program since we cannot guaranty correct + // test case setup and tear-down in this case. + ReportInvalidTestCaseType(test_case_name, code_location); + posix::Abort(); + } else { + // At this point we are sure that the object we found is of the same + // type we are looking for, so we downcast it to that type + // without further checks. + typed_test_info = CheckedDowncastToActualType< + ParameterizedTestCaseInfo >(*it); + } + break; + } + } + if (typed_test_info == NULL) { + typed_test_info = new ParameterizedTestCaseInfo( + test_case_name, code_location); + test_case_infos_.push_back(typed_test_info); + } + return typed_test_info; + } + void RegisterTests() { + for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); + it != test_case_infos_.end(); ++it) { + (*it)->RegisterTests(); + } + } + + private: + typedef ::std::vector TestCaseInfoContainer; + + TestCaseInfoContainer test_case_infos_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseRegistry); +}; + +} // namespace internal +} // namespace testing + +#endif // GTEST_HAS_PARAM_TEST + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h new file mode 100644 index 0000000000..74ab949057 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h @@ -0,0 +1,93 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// The Google C++ Testing Framework (Google Test) +// +// This header file defines the GTEST_OS_* macro. +// It is separate from gtest-port.h so that custom/gtest-port.h can include it. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ + +// Determines the platform on which Google Test is compiled. +#ifdef __CYGWIN__ +# define GTEST_OS_CYGWIN 1 +#elif defined __SYMBIAN32__ +# define GTEST_OS_SYMBIAN 1 +#elif defined _WIN32 +# define GTEST_OS_WINDOWS 1 +# ifdef _WIN32_WCE +# define GTEST_OS_WINDOWS_MOBILE 1 +# elif defined(__MINGW__) || defined(__MINGW32__) +# define GTEST_OS_WINDOWS_MINGW 1 +# elif defined(WINAPI_FAMILY) +# include +# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +# define GTEST_OS_WINDOWS_DESKTOP 1 +# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP) +# define GTEST_OS_WINDOWS_PHONE 1 +# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) +# define GTEST_OS_WINDOWS_RT 1 +# else + // WINAPI_FAMILY defined but no known partition matched. + // Default to desktop. +# define GTEST_OS_WINDOWS_DESKTOP 1 +# endif +# else +# define GTEST_OS_WINDOWS_DESKTOP 1 +# endif // _WIN32_WCE +#elif defined __APPLE__ +# define GTEST_OS_MAC 1 +# if TARGET_OS_IPHONE +# define GTEST_OS_IOS 1 +# endif +#elif defined __FreeBSD__ +# define GTEST_OS_FREEBSD 1 +#elif defined __linux__ +# define GTEST_OS_LINUX 1 +# if defined __ANDROID__ +# define GTEST_OS_LINUX_ANDROID 1 +# endif +#elif defined __MVS__ +# define GTEST_OS_ZOS 1 +#elif defined(__sun) && defined(__SVR4) +# define GTEST_OS_SOLARIS 1 +#elif defined(_AIX) +# define GTEST_OS_AIX 1 +#elif defined(__hpux) +# define GTEST_OS_HPUX 1 +#elif defined __native_client__ +# define GTEST_OS_NACL 1 +#elif defined __OpenBSD__ +# define GTEST_OS_OPENBSD 1 +#elif defined __QNX__ +# define GTEST_OS_QNX 1 +#endif // __CYGWIN__ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h new file mode 100644 index 0000000000..da57e65d33 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h @@ -0,0 +1,2567 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: wan@google.com (Zhanyong Wan) +// +// Low-level types and utilities for porting Google Test to various +// platforms. All macros ending with _ and symbols defined in an +// internal namespace are subject to change without notice. Code +// outside Google Test MUST NOT USE THEM DIRECTLY. Macros that don't +// end with _ are part of Google Test's public API and can be used by +// code outside Google Test. +// +// This file is fundamental to Google Test. All other Google Test source +// files are expected to #include this. Therefore, it cannot #include +// any other Google Test header. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ + +// Environment-describing macros +// ----------------------------- +// +// Google Test can be used in many different environments. Macros in +// this section tell Google Test what kind of environment it is being +// used in, such that Google Test can provide environment-specific +// features and implementations. +// +// Google Test tries to automatically detect the properties of its +// environment, so users usually don't need to worry about these +// macros. However, the automatic detection is not perfect. +// Sometimes it's necessary for a user to define some of the following +// macros in the build script to override Google Test's decisions. +// +// If the user doesn't define a macro in the list, Google Test will +// provide a default definition. After this header is #included, all +// macros in this list will be defined to either 1 or 0. +// +// Notes to maintainers: +// - Each macro here is a user-tweakable knob; do not grow the list +// lightly. +// - Use #if to key off these macros. Don't use #ifdef or "#if +// defined(...)", which will not work as these macros are ALWAYS +// defined. +// +// GTEST_HAS_CLONE - Define it to 1/0 to indicate that clone(2) +// is/isn't available. +// GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions +// are enabled. +// GTEST_HAS_GLOBAL_STRING - Define it to 1/0 to indicate that ::string +// is/isn't available (some systems define +// ::string, which is different to std::string). 
+// GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::string +// is/isn't available (some systems define +// ::wstring, which is different to std::wstring). +// GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular +// expressions are/aren't available. +// GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that +// is/isn't available. +// GTEST_HAS_RTTI - Define it to 1/0 to indicate that RTTI is/isn't +// enabled. +// GTEST_HAS_STD_WSTRING - Define it to 1/0 to indicate that +// std::wstring does/doesn't work (Google Test can +// be used where std::wstring is unavailable). +// GTEST_HAS_TR1_TUPLE - Define it to 1/0 to indicate tr1::tuple +// is/isn't available. +// GTEST_HAS_SEH - Define it to 1/0 to indicate whether the +// compiler supports Microsoft's "Structured +// Exception Handling". +// GTEST_HAS_STREAM_REDIRECTION +// - Define it to 1/0 to indicate whether the +// platform supports I/O stream redirection using +// dup() and dup2(). +// GTEST_USE_OWN_TR1_TUPLE - Define it to 1/0 to indicate whether Google +// Test's own tr1 tuple implementation should be +// used. Unused when the user sets +// GTEST_HAS_TR1_TUPLE to 0. +// GTEST_LANG_CXX11 - Define it to 1/0 to indicate that Google Test +// is building in C++11/C++98 mode. +// GTEST_LINKED_AS_SHARED_LIBRARY +// - Define to 1 when compiling tests that use +// Google Test as a shared library (known as +// DLL on Windows). +// GTEST_CREATE_SHARED_LIBRARY +// - Define to 1 when compiling Google Test itself +// as a shared library. + +// Platform-indicating macros +// -------------------------- +// +// Macros indicating the platform on which Google Test is being used +// (a macro is defined to 1 if compiled on the given platform; +// otherwise UNDEFINED -- it's never defined to 0.). Google Test +// defines these macros automatically. Code outside Google Test MUST +// NOT define them. +// +// GTEST_OS_AIX - IBM AIX +// GTEST_OS_CYGWIN - Cygwin +// GTEST_OS_FREEBSD - FreeBSD +// GTEST_OS_HPUX - HP-UX +// GTEST_OS_LINUX - Linux +// GTEST_OS_LINUX_ANDROID - Google Android +// GTEST_OS_MAC - Mac OS X +// GTEST_OS_IOS - iOS +// GTEST_OS_NACL - Google Native Client (NaCl) +// GTEST_OS_OPENBSD - OpenBSD +// GTEST_OS_QNX - QNX +// GTEST_OS_SOLARIS - Sun Solaris +// GTEST_OS_SYMBIAN - Symbian +// GTEST_OS_WINDOWS - Windows (Desktop, MinGW, or Mobile) +// GTEST_OS_WINDOWS_DESKTOP - Windows Desktop +// GTEST_OS_WINDOWS_MINGW - MinGW +// GTEST_OS_WINDOWS_MOBILE - Windows Mobile +// GTEST_OS_WINDOWS_PHONE - Windows Phone +// GTEST_OS_WINDOWS_RT - Windows Store App/WinRT +// GTEST_OS_ZOS - z/OS +// +// Among the platforms, Cygwin, Linux, Max OS X, and Windows have the +// most stable support. Since core members of the Google Test project +// don't have access to other platforms, support for them may be less +// stable. If you notice any problems on your platform, please notify +// googletestframework@googlegroups.com (patches for fixing them are +// even more welcome!). +// +// It is possible that none of the GTEST_OS_* macros are defined. + +// Feature-indicating macros +// ------------------------- +// +// Macros indicating which Google Test features are available (a macro +// is defined to 1 if the corresponding feature is supported; +// otherwise UNDEFINED -- it's never defined to 0.). Google Test +// defines these macros automatically. Code outside Google Test MUST +// NOT define them. +// +// These macros are public so that portable tests can be written. 
+// Such tests typically surround code using a feature with an #if +// which controls that code. For example: +// +// #if GTEST_HAS_DEATH_TEST +// EXPECT_DEATH(DoSomethingDeadly()); +// #endif +// +// GTEST_HAS_COMBINE - the Combine() function (for value-parameterized +// tests) +// GTEST_HAS_DEATH_TEST - death tests +// GTEST_HAS_PARAM_TEST - value-parameterized tests +// GTEST_HAS_TYPED_TEST - typed tests +// GTEST_HAS_TYPED_TEST_P - type-parameterized tests +// GTEST_IS_THREADSAFE - Google Test is thread-safe. +// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with +// GTEST_HAS_POSIX_RE (see above) which users can +// define themselves. +// GTEST_USES_SIMPLE_RE - our own simple regex is used; +// the above two are mutually exclusive. +// GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ(). + +// Misc public macros +// ------------------ +// +// GTEST_FLAG(flag_name) - references the variable corresponding to +// the given Google Test flag. + +// Internal utilities +// ------------------ +// +// The following macros and utilities are for Google Test's INTERNAL +// use only. Code outside Google Test MUST NOT USE THEM DIRECTLY. +// +// Macros for basic C++ coding: +// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning. +// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a +// variable don't have to be used. +// GTEST_DISALLOW_ASSIGN_ - disables operator=. +// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=. +// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used. +// GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is +// suppressed (constant conditional). +// GTEST_INTENTIONAL_CONST_COND_POP_ - finish code section where MSVC C4127 +// is suppressed. +// +// C++11 feature wrappers: +// +// testing::internal::move - portability wrapper for std::move. +// +// Synchronization: +// Mutex, MutexLock, ThreadLocal, GetThreadCount() +// - synchronization primitives. +// +// Template meta programming: +// is_pointer - as in TR1; needed on Symbian and IBM XL C/C++ only. +// IteratorTraits - partial implementation of std::iterator_traits, which +// is not available in libCstd when compiled with Sun C++. +// +// Smart pointers: +// scoped_ptr - as in TR2. +// +// Regular expressions: +// RE - a simple regular expression class using the POSIX +// Extended Regular Expression syntax on UNIX-like +// platforms, or a reduced regular exception syntax on +// other platforms, including Windows. +// +// Logging: +// GTEST_LOG_() - logs messages at the specified severity level. +// LogToStderr() - directs all log messages to stderr. +// FlushInfoLog() - flushes informational log messages. +// +// Stdout and stderr capturing: +// CaptureStdout() - starts capturing stdout. +// GetCapturedStdout() - stops capturing stdout and returns the captured +// string. +// CaptureStderr() - starts capturing stderr. +// GetCapturedStderr() - stops capturing stderr and returns the captured +// string. +// +// Integer types: +// TypeWithSize - maps an integer to a int type. +// Int32, UInt32, Int64, UInt64, TimeInMillis +// - integers of known sizes. +// BiggestInt - the biggest signed integer type. +// +// Command-line utilities: +// GTEST_DECLARE_*() - declares a flag. +// GTEST_DEFINE_*() - defines a flag. +// GetInjectableArgvs() - returns the command line as a vector of strings. +// +// Environment variable utilities: +// GetEnv() - gets the value of an environment variable. 
+// BoolFromGTestEnv() - parses a bool environment variable. +// Int32FromGTestEnv() - parses an Int32 environment variable. +// StringFromGTestEnv() - parses a string environment variable. + +#include // for isspace, etc +#include // for ptrdiff_t +#include +#include +#include +#ifndef _WIN32_WCE +# include +# include +#endif // !_WIN32_WCE + +#if defined __APPLE__ +# include +# include +#endif + +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include +#include // NOLINT + +#include "gtest/internal/gtest-port-arch.h" +#include "gtest/internal/custom/gtest-port.h" + +#if !defined(GTEST_DEV_EMAIL_) +# define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" +# define GTEST_FLAG_PREFIX_ "gtest_" +# define GTEST_FLAG_PREFIX_DASH_ "gtest-" +# define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" +# define GTEST_NAME_ "Google Test" +# define GTEST_PROJECT_URL_ "https://github.com/google/googletest/" +#endif // !defined(GTEST_DEV_EMAIL_) + +#if !defined(GTEST_INIT_GOOGLE_TEST_NAME_) +# define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest" +#endif // !defined(GTEST_INIT_GOOGLE_TEST_NAME_) + +// Determines the version of gcc that is used to compile this. +#ifdef __GNUC__ +// 40302 means version 4.3.2. +# define GTEST_GCC_VER_ \ + (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) +#endif // __GNUC__ + +// Macros for disabling Microsoft Visual C++ warnings. +// +// GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 4385) +// /* code that triggers warnings C4800 and C4385 */ +// GTEST_DISABLE_MSC_WARNINGS_POP_() +#if _MSC_VER >= 1500 +# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \ + __pragma(warning(push)) \ + __pragma(warning(disable: warnings)) +# define GTEST_DISABLE_MSC_WARNINGS_POP_() \ + __pragma(warning(pop)) +#else +// Older versions of MSVC don't have __pragma. +# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) +# define GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + +#ifndef GTEST_LANG_CXX11 +// gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when +// -std={c,gnu}++{0x,11} is passed. The C++11 standard specifies a +// value for __cplusplus, and recent versions of clang, gcc, and +// probably other compilers set that too in C++11 mode. +# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L +// Compiling in at least C++11 mode. +# define GTEST_LANG_CXX11 1 +# else +# define GTEST_LANG_CXX11 0 +# endif +#endif + +// Distinct from C++11 language support, some environments don't provide +// proper C++11 library support. Notably, it's possible to build in +// C++11 mode when targeting Mac OS X 10.6, which has an old libstdc++ +// with no C++11 support. +// +// libstdc++ has sufficient C++11 support as of GCC 4.6.0, __GLIBCXX__ +// 20110325, but maintenance releases in the 4.4 and 4.5 series followed +// this date, so check for those versions by their date stamps. +// https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html#abi.versioning +#if GTEST_LANG_CXX11 && \ + (!defined(__GLIBCXX__) || ( \ + __GLIBCXX__ >= 20110325ul && /* GCC >= 4.6.0 */ \ + /* Blacklist of patch releases of older branches: */ \ + __GLIBCXX__ != 20110416ul && /* GCC 4.4.6 */ \ + __GLIBCXX__ != 20120313ul && /* GCC 4.4.7 */ \ + __GLIBCXX__ != 20110428ul && /* GCC 4.5.3 */ \ + __GLIBCXX__ != 20120702ul)) /* GCC 4.5.4 */ +# define GTEST_STDLIB_CXX11 1 +#endif + +// Only use C++11 library features if the library provides them. 
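A short illustration, not part of the patch, of how later code is expected to key off these macros: GTEST_LANG_CXX11 is always defined to 0 or 1 once this header has been processed (so it is tested with #if, never #ifdef), while GTEST_STDLIB_CXX11 is defined only when the standard library itself offers usable C++11 support, per the __GLIBCXX__ date-stamp check above.

#include "gtest/internal/gtest-port.h"

#if GTEST_LANG_CXX11 && defined(GTEST_STDLIB_CXX11)
// Both the language and the library are C++11-capable: the GTEST_HAS_STD_*_
// feature macros defined next are available and C++11-only paths may be used.
#else
// C++11 language mode with a pre-C++11 library (e.g. Mac OS X 10.6's old
// libstdc++), or plain C++98: stay on the C++98 code paths.
#endif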
+#if GTEST_STDLIB_CXX11 +# define GTEST_HAS_STD_BEGIN_AND_END_ 1 +# define GTEST_HAS_STD_FORWARD_LIST_ 1 +# define GTEST_HAS_STD_FUNCTION_ 1 +# define GTEST_HAS_STD_INITIALIZER_LIST_ 1 +# define GTEST_HAS_STD_MOVE_ 1 +# define GTEST_HAS_STD_SHARED_PTR_ 1 +# define GTEST_HAS_STD_TYPE_TRAITS_ 1 +# define GTEST_HAS_STD_UNIQUE_PTR_ 1 +#endif + +// C++11 specifies that provides std::tuple. +// Some platforms still might not have it, however. +#if GTEST_LANG_CXX11 +# define GTEST_HAS_STD_TUPLE_ 1 +# if defined(__clang__) +// Inspired by http://clang.llvm.org/docs/LanguageExtensions.html#__has_include +# if defined(__has_include) && !__has_include() +# undef GTEST_HAS_STD_TUPLE_ +# endif +# elif defined(_MSC_VER) +// Inspired by boost/config/stdlib/dinkumware.hpp +# if defined(_CPPLIB_VER) && _CPPLIB_VER < 520 +# undef GTEST_HAS_STD_TUPLE_ +# endif +# elif defined(__GLIBCXX__) +// Inspired by boost/config/stdlib/libstdcpp3.hpp, +// http://gcc.gnu.org/gcc-4.2/changes.html and +// http://gcc.gnu.org/onlinedocs/libstdc++/manual/bk01pt01ch01.html#manual.intro.status.standard.200x +# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2) +# undef GTEST_HAS_STD_TUPLE_ +# endif +# endif +#endif + +// Brings in definitions for functions used in the testing::internal::posix +// namespace (read, write, close, chdir, isatty, stat). We do not currently +// use them on Windows Mobile. +#if GTEST_OS_WINDOWS +# if !GTEST_OS_WINDOWS_MOBILE +# include +# include +# endif +// In order to avoid having to include , use forward declaration +// assuming CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION. +// This assumption is verified by +// WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION. +struct _RTL_CRITICAL_SECTION; +#else +// This assumes that non-Windows OSes provide unistd.h. For OSes where this +// is not the case, we need to include headers that provide the functions +// mentioned above. +# include +# include +#endif // GTEST_OS_WINDOWS + +#if GTEST_OS_LINUX_ANDROID +// Used to define __ANDROID_API__ matching the target NDK API level. +# include // NOLINT +#endif + +// Defines this to true iff Google Test can use POSIX regular expressions. +#ifndef GTEST_HAS_POSIX_RE +# if GTEST_OS_LINUX_ANDROID +// On Android, is only available starting with Gingerbread. +# define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) +# else +# define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS) +# endif +#endif + +#if GTEST_USES_PCRE +// The appropriate headers have already been included. + +#elif GTEST_HAS_POSIX_RE + +// On some platforms, needs someone to define size_t, and +// won't compile otherwise. We can #include it here as we already +// included , which is guaranteed to define size_t through +// . +# include // NOLINT + +# define GTEST_USES_POSIX_RE 1 + +#elif GTEST_OS_WINDOWS + +// is not available on Windows. Use our own simple regex +// implementation instead. +# define GTEST_USES_SIMPLE_RE 1 + +#else + +// may not be available on this platform. Use our own +// simple regex implementation instead. +# define GTEST_USES_SIMPLE_RE 1 + +#endif // GTEST_USES_PCRE + +#ifndef GTEST_HAS_EXCEPTIONS +// The user didn't tell us whether exceptions are enabled, so we need +// to figure it out. +# if defined(_MSC_VER) || defined(__BORLANDC__) +// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS +// macro to enable exceptions, so we'll do the same. +// Assumes that exceptions are enabled by default. 
+# ifndef _HAS_EXCEPTIONS +# define _HAS_EXCEPTIONS 1 +# endif // _HAS_EXCEPTIONS +# define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS +# elif defined(__clang__) +// clang defines __EXCEPTIONS iff exceptions are enabled before clang 220714, +// but iff cleanups are enabled after that. In Obj-C++ files, there can be +// cleanups for ObjC exceptions which also need cleanups, even if C++ exceptions +// are disabled. clang has __has_feature(cxx_exceptions) which checks for C++ +// exceptions starting at clang r206352, but which checked for cleanups prior to +// that. To reliably check for C++ exception availability with clang, check for +// __EXCEPTIONS && __has_feature(cxx_exceptions). +# define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions)) +# elif defined(__GNUC__) && __EXCEPTIONS +// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled. +# define GTEST_HAS_EXCEPTIONS 1 +# elif defined(__SUNPRO_CC) +// Sun Pro CC supports exceptions. However, there is no compile-time way of +// detecting whether they are enabled or not. Therefore, we assume that +// they are enabled unless the user tells us otherwise. +# define GTEST_HAS_EXCEPTIONS 1 +# elif defined(__IBMCPP__) && __EXCEPTIONS +// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled. +# define GTEST_HAS_EXCEPTIONS 1 +# elif defined(__HP_aCC) +// Exception handling is in effect by default in HP aCC compiler. It has to +// be turned of by +noeh compiler option if desired. +# define GTEST_HAS_EXCEPTIONS 1 +# else +// For other compilers, we assume exceptions are disabled to be +// conservative. +# define GTEST_HAS_EXCEPTIONS 0 +# endif // defined(_MSC_VER) || defined(__BORLANDC__) +#endif // GTEST_HAS_EXCEPTIONS + +#if !defined(GTEST_HAS_STD_STRING) +// Even though we don't use this macro any longer, we keep it in case +// some clients still depend on it. +# define GTEST_HAS_STD_STRING 1 +#elif !GTEST_HAS_STD_STRING +// The user told us that ::std::string isn't available. +# error "Google Test cannot be used where ::std::string isn't available." +#endif // !defined(GTEST_HAS_STD_STRING) + +#ifndef GTEST_HAS_GLOBAL_STRING +// The user didn't tell us whether ::string is available, so we need +// to figure it out. + +# define GTEST_HAS_GLOBAL_STRING 0 + +#endif // GTEST_HAS_GLOBAL_STRING + +#ifndef GTEST_HAS_STD_WSTRING +// The user didn't tell us whether ::std::wstring is available, so we need +// to figure it out. +// TODO(wan@google.com): uses autoconf to detect whether ::std::wstring +// is available. + +// Cygwin 1.7 and below doesn't support ::std::wstring. +// Solaris' libc++ doesn't support it either. Android has +// no support for it at least as recent as Froyo (2.2). +# define GTEST_HAS_STD_WSTRING \ + (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS)) + +#endif // GTEST_HAS_STD_WSTRING + +#ifndef GTEST_HAS_GLOBAL_WSTRING +// The user didn't tell us whether ::wstring is available, so we need +// to figure it out. +# define GTEST_HAS_GLOBAL_WSTRING \ + (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING) +#endif // GTEST_HAS_GLOBAL_WSTRING + +// Determines whether RTTI is available. +#ifndef GTEST_HAS_RTTI +// The user didn't tell us whether RTTI is enabled, so we need to +// figure it out. + +# ifdef _MSC_VER + +# ifdef _CPPRTTI // MSVC defines this macro iff RTTI is enabled. +# define GTEST_HAS_RTTI 1 +# else +# define GTEST_HAS_RTTI 0 +# endif + +// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled. 
+# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302) + +# ifdef __GXX_RTTI +// When building against STLport with the Android NDK and with +// -frtti -fno-exceptions, the build fails at link time with undefined +// references to __cxa_bad_typeid. Note sure if STL or toolchain bug, +// so disable RTTI when detected. +# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \ + !defined(__EXCEPTIONS) +# define GTEST_HAS_RTTI 0 +# else +# define GTEST_HAS_RTTI 1 +# endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS +# else +# define GTEST_HAS_RTTI 0 +# endif // __GXX_RTTI + +// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends +// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the +// first version with C++ support. +# elif defined(__clang__) + +# define GTEST_HAS_RTTI __has_feature(cxx_rtti) + +// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if +// both the typeid and dynamic_cast features are present. +# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) + +# ifdef __RTTI_ALL__ +# define GTEST_HAS_RTTI 1 +# else +# define GTEST_HAS_RTTI 0 +# endif + +# else + +// For all other compilers, we assume RTTI is enabled. +# define GTEST_HAS_RTTI 1 + +# endif // _MSC_VER + +#endif // GTEST_HAS_RTTI + +// It's this header's responsibility to #include when RTTI +// is enabled. +#if GTEST_HAS_RTTI +# include +#endif + +// Determines whether Google Test can use the pthreads library. +#ifndef GTEST_HAS_PTHREAD +// The user didn't tell us explicitly, so we make reasonable assumptions about +// which platforms have pthreads support. +// +// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0 +// to your compiler flags. +# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX \ + || GTEST_OS_QNX || GTEST_OS_FREEBSD || GTEST_OS_NACL) +#endif // GTEST_HAS_PTHREAD + +#if GTEST_HAS_PTHREAD +// gtest-port.h guarantees to #include when GTEST_HAS_PTHREAD is +// true. +# include // NOLINT + +// For timespec and nanosleep, used below. +# include // NOLINT +#endif + +// Determines if hash_map/hash_set are available. +// Only used for testing against those containers. +#if !defined(GTEST_HAS_HASH_MAP_) +# if _MSC_VER +# define GTEST_HAS_HASH_MAP_ 1 // Indicates that hash_map is available. +# define GTEST_HAS_HASH_SET_ 1 // Indicates that hash_set is available. +# endif // _MSC_VER +#endif // !defined(GTEST_HAS_HASH_MAP_) + +// Determines whether Google Test can use tr1/tuple. You can define +// this macro to 0 to prevent Google Test from using tuple (any +// feature depending on tuple with be disabled in this mode). +#ifndef GTEST_HAS_TR1_TUPLE +# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) +// STLport, provided with the Android NDK, has neither or . +# define GTEST_HAS_TR1_TUPLE 0 +# else +// The user didn't tell us not to do it, so we assume it's OK. +# define GTEST_HAS_TR1_TUPLE 1 +# endif +#endif // GTEST_HAS_TR1_TUPLE + +// Determines whether Google Test's own tr1 tuple implementation +// should be used. +#ifndef GTEST_USE_OWN_TR1_TUPLE +// The user didn't tell us, so we need to figure it out. + +// We use our own TR1 tuple if we aren't sure the user has an +// implementation of it already. At this time, libstdc++ 4.0.0+ and +// MSVC 2010 are the only mainstream standard libraries that come +// with a TR1 tuple implementation. NVIDIA's CUDA NVCC compiler +// pretends to be GCC by defining __GNUC__ and friends, but cannot +// compile GCC's tuple implementation. 
MSVC 2008 (9.0) provides TR1 +// tuple in a 323 MB Feature Pack download, which we cannot assume the +// user has. QNX's QCC compiler is a modified GCC but it doesn't +// support TR1 tuple. libc++ only provides std::tuple, in C++11 mode, +// and it can be used with some compilers that define __GNUC__. +# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \ + && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) || _MSC_VER >= 1600 +# define GTEST_ENV_HAS_TR1_TUPLE_ 1 +# endif + +// C++11 specifies that provides std::tuple. Use that if gtest is used +// in C++11 mode and libstdc++ isn't very old (binaries targeting OS X 10.6 +// can build with clang but need to use gcc4.2's libstdc++). +# if GTEST_LANG_CXX11 && (!defined(__GLIBCXX__) || __GLIBCXX__ > 20110325) +# define GTEST_ENV_HAS_STD_TUPLE_ 1 +# endif + +# if GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_ +# define GTEST_USE_OWN_TR1_TUPLE 0 +# else +# define GTEST_USE_OWN_TR1_TUPLE 1 +# endif + +#endif // GTEST_USE_OWN_TR1_TUPLE + +// To avoid conditional compilation everywhere, we make it +// gtest-port.h's responsibility to #include the header implementing +// tuple. +#if GTEST_HAS_STD_TUPLE_ +# include // IWYU pragma: export +# define GTEST_TUPLE_NAMESPACE_ ::std +#endif // GTEST_HAS_STD_TUPLE_ + +// We include tr1::tuple even if std::tuple is available to define printers for +// them. +#if GTEST_HAS_TR1_TUPLE +# ifndef GTEST_TUPLE_NAMESPACE_ +# define GTEST_TUPLE_NAMESPACE_ ::std::tr1 +# endif // GTEST_TUPLE_NAMESPACE_ + +# if GTEST_USE_OWN_TR1_TUPLE +# include "gtest/internal/gtest-tuple.h" // IWYU pragma: export // NOLINT +# elif GTEST_ENV_HAS_STD_TUPLE_ +# include +// C++11 puts its tuple into the ::std namespace rather than +// ::std::tr1. gtest expects tuple to live in ::std::tr1, so put it there. +// This causes undefined behavior, but supported compilers react in +// the way we intend. +namespace std { +namespace tr1 { +using ::std::get; +using ::std::make_tuple; +using ::std::tuple; +using ::std::tuple_element; +using ::std::tuple_size; +} +} + +# elif GTEST_OS_SYMBIAN + +// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to +// use STLport's tuple implementation, which unfortunately doesn't +// work as the copy of STLport distributed with Symbian is incomplete. +// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to +// use its own tuple implementation. +# ifdef BOOST_HAS_TR1_TUPLE +# undef BOOST_HAS_TR1_TUPLE +# endif // BOOST_HAS_TR1_TUPLE + +// This prevents , which defines +// BOOST_HAS_TR1_TUPLE, from being #included by Boost's . +# define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED +# include // IWYU pragma: export // NOLINT + +# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000) +// GCC 4.0+ implements tr1/tuple in the header. This does +// not conform to the TR1 spec, which requires the header to be . + +# if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 +// Until version 4.3.2, gcc has a bug that causes , +// which is #included by , to not compile when RTTI is +// disabled. _TR1_FUNCTIONAL is the header guard for +// . Hence the following #define is a hack to prevent +// from being included. +# define _TR1_FUNCTIONAL 1 +# include +# undef _TR1_FUNCTIONAL // Allows the user to #include + // if he chooses to. +# else +# include // NOLINT +# endif // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 + +# else +// If the compiler is not GCC 4.0+, we assume the user is using a +// spec-conforming TR1 implementation. 
+# include // IWYU pragma: export // NOLINT +# endif // GTEST_USE_OWN_TR1_TUPLE + +#endif // GTEST_HAS_TR1_TUPLE + +// Determines whether clone(2) is supported. +// Usually it will only be available on Linux, excluding +// Linux on the Itanium architecture. +// Also see http://linux.die.net/man/2/clone. +#ifndef GTEST_HAS_CLONE +// The user didn't tell us, so we need to figure it out. + +# if GTEST_OS_LINUX && !defined(__ia64__) +# if GTEST_OS_LINUX_ANDROID +// On Android, clone() is only available on ARM starting with Gingerbread. +# if defined(__arm__) && __ANDROID_API__ >= 9 +# define GTEST_HAS_CLONE 1 +# else +# define GTEST_HAS_CLONE 0 +# endif +# else +# define GTEST_HAS_CLONE 1 +# endif +# else +# define GTEST_HAS_CLONE 0 +# endif // GTEST_OS_LINUX && !defined(__ia64__) + +#endif // GTEST_HAS_CLONE + +// Determines whether to support stream redirection. This is used to test +// output correctness and to implement death tests. +#ifndef GTEST_HAS_STREAM_REDIRECTION +// By default, we assume that stream redirection is supported on all +// platforms except known mobile ones. +# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || \ + GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT +# define GTEST_HAS_STREAM_REDIRECTION 0 +# else +# define GTEST_HAS_STREAM_REDIRECTION 1 +# endif // !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_SYMBIAN +#endif // GTEST_HAS_STREAM_REDIRECTION + +// Determines whether to support death tests. +// Google Test does not support death tests for VC 7.1 and earlier as +// abort() in a VC 7.1 application compiled as GUI in debug config +// pops up a dialog window that cannot be suppressed programmatically. +#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ + (GTEST_OS_MAC && !GTEST_OS_IOS) || \ + (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \ + GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \ + GTEST_OS_OPENBSD || GTEST_OS_QNX || GTEST_OS_FREEBSD) +# define GTEST_HAS_DEATH_TEST 1 +#endif + +// We don't support MSVC 7.1 with exceptions disabled now. Therefore +// all the compilers we care about are adequate for supporting +// value-parameterized tests. +#define GTEST_HAS_PARAM_TEST 1 + +// Determines whether to support type-driven tests. + +// Typed tests need and variadic macros, which GCC, VC++ 8.0, +// Sun Pro CC, IBM Visual Age, and HP aCC support. +#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \ + defined(__IBMCPP__) || defined(__HP_aCC) +# define GTEST_HAS_TYPED_TEST 1 +# define GTEST_HAS_TYPED_TEST_P 1 +#endif + +// Determines whether to support Combine(). This only makes sense when +// value-parameterized tests are enabled. The implementation doesn't +// work on Sun Studio since it doesn't understand templated conversion +// operators. +#if GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !defined(__SUNPRO_CC) +# define GTEST_HAS_COMBINE 1 +#endif + +// Determines whether the system compiler uses UTF-16 for encoding wide strings. +#define GTEST_WIDE_STRING_USES_UTF16_ \ + (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX) + +// Determines whether test results can be streamed to a socket. +#if GTEST_OS_LINUX +# define GTEST_CAN_STREAM_RESULTS_ 1 +#endif + +// Defines some utility macros. + +// The GNU compiler emits a warning if nested "if" statements are followed by +// an "else" statement and braces are not used to explicitly disambiguate the +// "else" binding. 
This leads to problems with code like: +// +// if (gate) +// ASSERT_*(condition) << "Some message"; +// +// The "switch (0) case 0:" idiom is used to suppress this. +#ifdef __INTEL_COMPILER +# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ +#else +# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default: // NOLINT +#endif + +// Use this annotation at the end of a struct/class definition to +// prevent the compiler from optimizing away instances that are never +// used. This is useful when all interesting logic happens inside the +// c'tor and / or d'tor. Example: +// +// struct Foo { +// Foo() { ... } +// } GTEST_ATTRIBUTE_UNUSED_; +// +// Also use it after a variable or parameter declaration to tell the +// compiler the variable/parameter does not have to be used. +#if defined(__GNUC__) && !defined(COMPILER_ICC) +# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused)) +#elif defined(__clang__) +# if __has_attribute(unused) +# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused)) +# endif +#endif +#ifndef GTEST_ATTRIBUTE_UNUSED_ +# define GTEST_ATTRIBUTE_UNUSED_ +#endif + +// A macro to disallow operator= +// This should be used in the private: declarations for a class. +#define GTEST_DISALLOW_ASSIGN_(type)\ + void operator=(type const &) + +// A macro to disallow copy constructor and operator= +// This should be used in the private: declarations for a class. +#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type)\ + type(type const &);\ + GTEST_DISALLOW_ASSIGN_(type) + +// Tell the compiler to warn about unused return values for functions declared +// with this macro. The macro should be used on function declarations +// following the argument list: +// +// Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_; +#if defined(__GNUC__) && (GTEST_GCC_VER_ >= 30400) && !defined(COMPILER_ICC) +# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result)) +#else +# define GTEST_MUST_USE_RESULT_ +#endif // __GNUC__ && (GTEST_GCC_VER_ >= 30400) && !COMPILER_ICC + +// MS C++ compiler emits warning when a conditional expression is compile time +// constant. In some contexts this warning is false positive and needs to be +// suppressed. Use the following two macros in such cases: +// +// GTEST_INTENTIONAL_CONST_COND_PUSH_() +// while (true) { +// GTEST_INTENTIONAL_CONST_COND_POP_() +// } +# define GTEST_INTENTIONAL_CONST_COND_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127) +# define GTEST_INTENTIONAL_CONST_COND_POP_() \ + GTEST_DISABLE_MSC_WARNINGS_POP_() + +// Determine whether the compiler supports Microsoft's Structured Exception +// Handling. This is supported by several Windows compilers but generally +// does not exist on any other system. +#ifndef GTEST_HAS_SEH +// The user didn't tell us, so we need to figure it out. + +# if defined(_MSC_VER) || defined(__BORLANDC__) +// These two compilers are known to support SEH. +# define GTEST_HAS_SEH 1 +# else +// Assume no SEH. 
+# define GTEST_HAS_SEH 0 +# endif + +#define GTEST_IS_THREADSAFE \ + (GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ \ + || (GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT) \ + || GTEST_HAS_PTHREAD) + +#endif // GTEST_HAS_SEH + +#ifdef _MSC_VER +# if GTEST_LINKED_AS_SHARED_LIBRARY +# define GTEST_API_ __declspec(dllimport) +# elif GTEST_CREATE_SHARED_LIBRARY +# define GTEST_API_ __declspec(dllexport) +# endif +#elif __GNUC__ >= 4 || defined(__clang__) +# define GTEST_API_ __attribute__((visibility ("default"))) +#endif // _MSC_VER + +#ifndef GTEST_API_ +# define GTEST_API_ +#endif + +#ifdef __GNUC__ +// Ask the compiler to never inline a given function. +# define GTEST_NO_INLINE_ __attribute__((noinline)) +#else +# define GTEST_NO_INLINE_ +#endif + +// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project. +#if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION) +# define GTEST_HAS_CXXABI_H_ 1 +#else +# define GTEST_HAS_CXXABI_H_ 0 +#endif + +// A function level attribute to disable checking for use of uninitialized +// memory when built with MemorySanitizer. +#if defined(__clang__) +# if __has_feature(memory_sanitizer) +# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ \ + __attribute__((no_sanitize_memory)) +# else +# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +# endif // __has_feature(memory_sanitizer) +#else +# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +#endif // __clang__ + +// A function level attribute to disable AddressSanitizer instrumentation. +#if defined(__clang__) +# if __has_feature(address_sanitizer) +# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \ + __attribute__((no_sanitize_address)) +# else +# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +# endif // __has_feature(address_sanitizer) +#else +# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +#endif // __clang__ + +// A function level attribute to disable ThreadSanitizer instrumentation. +#if defined(__clang__) +# if __has_feature(thread_sanitizer) +# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ \ + __attribute__((no_sanitize_thread)) +# else +# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +# endif // __has_feature(thread_sanitizer) +#else +# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +#endif // __clang__ + +// A function level attribute to disable UndefinedBehaviorSanitizer's (defined) +// unsigned integer overflow instrumentation. +#if defined(__clang__) +# if defined(__has_attribute) && __has_attribute(no_sanitize) +# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ \ + __attribute__((no_sanitize("unsigned-integer-overflow"))) +# else +# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ +# endif // defined(__has_attribute) && __has_attribute(no_sanitize) +#else +# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ +#endif // __clang__ + +namespace testing { + +class Message; + +#if defined(GTEST_TUPLE_NAMESPACE_) +// Import tuple and friends into the ::testing namespace. +// It is part of our interface, having them in ::testing allows us to change +// their types as needed. +using GTEST_TUPLE_NAMESPACE_::get; +using GTEST_TUPLE_NAMESPACE_::make_tuple; +using GTEST_TUPLE_NAMESPACE_::tuple; +using GTEST_TUPLE_NAMESPACE_::tuple_size; +using GTEST_TUPLE_NAMESPACE_::tuple_element; +#endif // defined(GTEST_TUPLE_NAMESPACE_) + +namespace internal { + +// A secret type that Google Test users don't know about. It has no +// definition on purpose. Therefore it's impossible to create a +// Secret object, which is what we want. 
+class Secret; + +// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile time +// expression is true. For example, you could use it to verify the +// size of a static array: +// +// GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(names) == NUM_NAMES, +// names_incorrect_size); +// +// or to make sure a struct is smaller than a certain size: +// +// GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large); +// +// The second argument to the macro is the name of the variable. If +// the expression is false, most compilers will issue a warning/error +// containing the name of the variable. + +#if GTEST_LANG_CXX11 +# define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg) +#else // !GTEST_LANG_CXX11 +template + struct CompileAssert { +}; + +# define GTEST_COMPILE_ASSERT_(expr, msg) \ + typedef ::testing::internal::CompileAssert<(static_cast(expr))> \ + msg[static_cast(expr) ? 1 : -1] GTEST_ATTRIBUTE_UNUSED_ +#endif // !GTEST_LANG_CXX11 + +// Implementation details of GTEST_COMPILE_ASSERT_: +// +// (In C++11, we simply use static_assert instead of the following) +// +// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1 +// elements (and thus is invalid) when the expression is false. +// +// - The simpler definition +// +// #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1] +// +// does not work, as gcc supports variable-length arrays whose sizes +// are determined at run-time (this is gcc's extension and not part +// of the C++ standard). As a result, gcc fails to reject the +// following code with the simple definition: +// +// int foo; +// GTEST_COMPILE_ASSERT_(foo, msg); // not supposed to compile as foo is +// // not a compile-time constant. +// +// - By using the type CompileAssert<(bool(expr))>, we ensures that +// expr is a compile-time constant. (Template arguments must be +// determined at compile-time.) +// +// - The outter parentheses in CompileAssert<(bool(expr))> are necessary +// to work around a bug in gcc 3.4.4 and 4.0.1. If we had written +// +// CompileAssert +// +// instead, these compilers will refuse to compile +// +// GTEST_COMPILE_ASSERT_(5 > 0, some_message); +// +// (They seem to think the ">" in "5 > 0" marks the end of the +// template argument list.) +// +// - The array size is (bool(expr) ? 1 : -1), instead of simply +// +// ((expr) ? 1 : -1). +// +// This is to avoid running into a bug in MS VC 7.1, which +// causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1. + +// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h. +// +// This template is declared, but intentionally undefined. +template +struct StaticAssertTypeEqHelper; + +template +struct StaticAssertTypeEqHelper { + enum { value = true }; +}; + +// Evaluates to the number of elements in 'array'. +#define GTEST_ARRAY_SIZE_(array) (sizeof(array) / sizeof(array[0])) + +#if GTEST_HAS_GLOBAL_STRING +typedef ::string string; +#else +typedef ::std::string string; +#endif // GTEST_HAS_GLOBAL_STRING + +#if GTEST_HAS_GLOBAL_WSTRING +typedef ::wstring wstring; +#elif GTEST_HAS_STD_WSTRING +typedef ::std::wstring wstring; +#endif // GTEST_HAS_GLOBAL_WSTRING + +// A helper for suppressing warnings on constant condition. It just +// returns 'condition'. +GTEST_API_ bool IsTrue(bool condition); + +// Defines scoped_ptr. + +// This implementation of scoped_ptr is PARTIAL - it only contains +// enough stuff to satisfy Google Test's need. 
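+//
+// Editor's illustrative sketch (not part of the upstream header): the class
+// below deletes the owned object when it goes out of scope, so typical use
+// looks like the following.
+//
+//   {
+//     testing::internal::scoped_ptr<int> value(new int(42));
+//     *value += 1;   // Use like a raw pointer via operator* / operator->.
+//   }                // The owned int is deleted here by ~scoped_ptr().
+//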
+template +class scoped_ptr { + public: + typedef T element_type; + + explicit scoped_ptr(T* p = NULL) : ptr_(p) {} + ~scoped_ptr() { reset(); } + + T& operator*() const { return *ptr_; } + T* operator->() const { return ptr_; } + T* get() const { return ptr_; } + + T* release() { + T* const ptr = ptr_; + ptr_ = NULL; + return ptr; + } + + void reset(T* p = NULL) { + if (p != ptr_) { + if (IsTrue(sizeof(T) > 0)) { // Makes sure T is a complete type. + delete ptr_; + } + ptr_ = p; + } + } + + friend void swap(scoped_ptr& a, scoped_ptr& b) { + using std::swap; + swap(a.ptr_, b.ptr_); + } + + private: + T* ptr_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr); +}; + +// Defines RE. + +// A simple C++ wrapper for . It uses the POSIX Extended +// Regular Expression syntax. +class GTEST_API_ RE { + public: + // A copy constructor is required by the Standard to initialize object + // references from r-values. + RE(const RE& other) { Init(other.pattern()); } + + // Constructs an RE from a string. + RE(const ::std::string& regex) { Init(regex.c_str()); } // NOLINT + +#if GTEST_HAS_GLOBAL_STRING + + RE(const ::string& regex) { Init(regex.c_str()); } // NOLINT + +#endif // GTEST_HAS_GLOBAL_STRING + + RE(const char* regex) { Init(regex); } // NOLINT + ~RE(); + + // Returns the string representation of the regex. + const char* pattern() const { return pattern_; } + + // FullMatch(str, re) returns true iff regular expression re matches + // the entire str. + // PartialMatch(str, re) returns true iff regular expression re + // matches a substring of str (including str itself). + // + // TODO(wan@google.com): make FullMatch() and PartialMatch() work + // when str contains NUL characters. + static bool FullMatch(const ::std::string& str, const RE& re) { + return FullMatch(str.c_str(), re); + } + static bool PartialMatch(const ::std::string& str, const RE& re) { + return PartialMatch(str.c_str(), re); + } + +#if GTEST_HAS_GLOBAL_STRING + + static bool FullMatch(const ::string& str, const RE& re) { + return FullMatch(str.c_str(), re); + } + static bool PartialMatch(const ::string& str, const RE& re) { + return PartialMatch(str.c_str(), re); + } + +#endif // GTEST_HAS_GLOBAL_STRING + + static bool FullMatch(const char* str, const RE& re); + static bool PartialMatch(const char* str, const RE& re); + + private: + void Init(const char* regex); + + // We use a const char* instead of an std::string, as Google Test used to be + // used where std::string is not available. TODO(wan@google.com): change to + // std::string. + const char* pattern_; + bool is_valid_; + +#if GTEST_USES_POSIX_RE + + regex_t full_regex_; // For FullMatch(). + regex_t partial_regex_; // For PartialMatch(). + +#else // GTEST_USES_SIMPLE_RE + + const char* full_pattern_; // For FullMatch(); + +#endif + + GTEST_DISALLOW_ASSIGN_(RE); +}; + +// Formats a source file path and a line number as they would appear +// in an error message from the compiler used to compile this code. +GTEST_API_ ::std::string FormatFileLocation(const char* file, int line); + +// Formats a file location for compiler-independent XML output. +// Although this function is not platform dependent, we put it next to +// FormatFileLocation in order to contrast the two functions. +GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file, + int line); + +// Defines logging utilities: +// GTEST_LOG_(severity) - logs messages at the specified severity level. The +// message itself is streamed into the macro. 
+// LogToStderr() - directs all log messages to stderr. +// FlushInfoLog() - flushes informational log messages. + +enum GTestLogSeverity { + GTEST_INFO, + GTEST_WARNING, + GTEST_ERROR, + GTEST_FATAL +}; + +// Formats log entry severity, provides a stream object for streaming the +// log message, and terminates the message with a newline when going out of +// scope. +class GTEST_API_ GTestLog { + public: + GTestLog(GTestLogSeverity severity, const char* file, int line); + + // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. + ~GTestLog(); + + ::std::ostream& GetStream() { return ::std::cerr; } + + private: + const GTestLogSeverity severity_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog); +}; + +#if !defined(GTEST_LOG_) + +# define GTEST_LOG_(severity) \ + ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \ + __FILE__, __LINE__).GetStream() + +inline void LogToStderr() {} +inline void FlushInfoLog() { fflush(NULL); } + +#endif // !defined(GTEST_LOG_) + +#if !defined(GTEST_CHECK_) +// INTERNAL IMPLEMENTATION - DO NOT USE. +// +// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition +// is not satisfied. +// Synopsys: +// GTEST_CHECK_(boolean_condition); +// or +// GTEST_CHECK_(boolean_condition) << "Additional message"; +// +// This checks the condition and if the condition is not satisfied +// it prints message about the condition violation, including the +// condition itself, plus additional message streamed into it, if any, +// and then it aborts the program. It aborts the program irrespective of +// whether it is built in the debug mode or not. +# define GTEST_CHECK_(condition) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::IsTrue(condition)) \ + ; \ + else \ + GTEST_LOG_(FATAL) << "Condition " #condition " failed. " +#endif // !defined(GTEST_CHECK_) + +// An all-mode assert to verify that the given POSIX-style function +// call returns 0 (indicating success). Known limitation: this +// doesn't expand to a balanced 'if' statement, so enclose the macro +// in {} if you need to use it as the only statement in an 'if' +// branch. +#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \ + if (const int gtest_error = (posix_call)) \ + GTEST_LOG_(FATAL) << #posix_call << "failed with error " \ + << gtest_error + +#if GTEST_HAS_STD_MOVE_ +using std::move; +#else // GTEST_HAS_STD_MOVE_ +template +const T& move(const T& t) { + return t; +} +#endif // GTEST_HAS_STD_MOVE_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Use ImplicitCast_ as a safe version of static_cast for upcasting in +// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a +// const Foo*). When you use ImplicitCast_, the compiler checks that +// the cast is safe. Such explicit ImplicitCast_s are necessary in +// surprisingly many situations where C++ demands an exact type match +// instead of an argument type convertable to a target type. +// +// The syntax for using ImplicitCast_ is the same as for static_cast: +// +// ImplicitCast_(expr) +// +// ImplicitCast_ would have been part of the C++ standard library, +// but the proposal was submitted too late. It will probably make +// its way into the language in the future. +// +// This relatively ugly name is intentional. It prevents clashes with +// similar functions users may have (e.g., implicit_cast). The internal +// namespace alone is not enough because the function can be found by ADL. 
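+//
+// Editor's illustrative sketch (not part of the upstream header); the types
+// Base and Derived are hypothetical. ImplicitCast_ only compiles when the
+// conversion is implicit, e.g. an upcast:
+//
+//   class Base {};
+//   class Derived : public Base {};
+//   Derived d;
+//   Base* b = ::testing::internal::ImplicitCast_<Base*>(&d);  // OK: upcast.
+//   // ImplicitCast_<Derived*>(b) would not compile: there is no implicit
+//   // conversion from Base* to Derived*.
+//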
+template +inline To ImplicitCast_(To x) { return x; } + +// When you upcast (that is, cast a pointer from type Foo to type +// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts +// always succeed. When you downcast (that is, cast a pointer from +// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because +// how do you know the pointer is really of type SubclassOfFoo? It +// could be a bare Foo, or of type DifferentSubclassOfFoo. Thus, +// when you downcast, you should use this macro. In debug mode, we +// use dynamic_cast<> to double-check the downcast is legal (we die +// if it's not). In normal mode, we do the efficient static_cast<> +// instead. Thus, it's important to test in debug mode to make sure +// the cast is legal! +// This is the only place in the code we should use dynamic_cast<>. +// In particular, you SHOULDN'T be using dynamic_cast<> in order to +// do RTTI (eg code like this: +// if (dynamic_cast(foo)) HandleASubclass1Object(foo); +// if (dynamic_cast(foo)) HandleASubclass2Object(foo); +// You should design the code some other way not to need this. +// +// This relatively ugly name is intentional. It prevents clashes with +// similar functions users may have (e.g., down_cast). The internal +// namespace alone is not enough because the function can be found by ADL. +template // use like this: DownCast_(foo); +inline To DownCast_(From* f) { // so we only accept pointers + // Ensures that To is a sub-type of From *. This test is here only + // for compile-time type checking, and has no overhead in an + // optimized build at run-time, as it will be optimized away + // completely. + GTEST_INTENTIONAL_CONST_COND_PUSH_() + if (false) { + GTEST_INTENTIONAL_CONST_COND_POP_() + const To to = NULL; + ::testing::internal::ImplicitCast_(to); + } + +#if GTEST_HAS_RTTI + // RTTI: debug mode only! + GTEST_CHECK_(f == NULL || dynamic_cast(f) != NULL); +#endif + return static_cast(f); +} + +// Downcasts the pointer of type Base to Derived. +// Derived must be a subclass of Base. The parameter MUST +// point to a class of type Derived, not any subclass of it. +// When RTTI is available, the function performs a runtime +// check to enforce this. +template +Derived* CheckedDowncastToActualType(Base* base) { +#if GTEST_HAS_RTTI + GTEST_CHECK_(typeid(*base) == typeid(Derived)); +#endif + +#if GTEST_HAS_DOWNCAST_ + return ::down_cast(base); +#elif GTEST_HAS_RTTI + return dynamic_cast(base); // NOLINT +#else + return static_cast(base); // Poor man's downcast. +#endif +} + +#if GTEST_HAS_STREAM_REDIRECTION + +// Defines the stderr capturer: +// CaptureStdout - starts capturing stdout. +// GetCapturedStdout - stops capturing stdout and returns the captured string. +// CaptureStderr - starts capturing stderr. +// GetCapturedStderr - stops capturing stderr and returns the captured string. +// +GTEST_API_ void CaptureStdout(); +GTEST_API_ std::string GetCapturedStdout(); +GTEST_API_ void CaptureStderr(); +GTEST_API_ std::string GetCapturedStderr(); + +#endif // GTEST_HAS_STREAM_REDIRECTION + +// Returns a path to temporary directory. +GTEST_API_ std::string TempDir(); + +// Returns the size (in bytes) of a file. +GTEST_API_ size_t GetFileSize(FILE* file); + +// Reads the entire content of a file as a string. +GTEST_API_ std::string ReadEntireFile(FILE* file); + +// All command line arguments. 
+GTEST_API_ const ::std::vector& GetArgvs(); + +#if GTEST_HAS_DEATH_TEST + +const ::std::vector& GetInjectableArgvs(); +void SetInjectableArgvs(const ::std::vector* + new_argvs); + + +#endif // GTEST_HAS_DEATH_TEST + +// Defines synchronization primitives. +#if GTEST_IS_THREADSAFE +# if GTEST_HAS_PTHREAD +// Sleeps for (roughly) n milliseconds. This function is only for testing +// Google Test's own constructs. Don't use it in user tests, either +// directly or indirectly. +inline void SleepMilliseconds(int n) { + const timespec time = { + 0, // 0 seconds. + n * 1000L * 1000L, // And n ms. + }; + nanosleep(&time, NULL); +} +# endif // GTEST_HAS_PTHREAD + +# if GTEST_HAS_NOTIFICATION_ +// Notification has already been imported into the namespace. +// Nothing to do here. + +# elif GTEST_HAS_PTHREAD +// Allows a controller thread to pause execution of newly created +// threads until notified. Instances of this class must be created +// and destroyed in the controller thread. +// +// This class is only for testing Google Test's own constructs. Do not +// use it in user tests, either directly or indirectly. +class Notification { + public: + Notification() : notified_(false) { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL)); + } + ~Notification() { + pthread_mutex_destroy(&mutex_); + } + + // Notifies all threads created with this notification to start. Must + // be called from the controller thread. + void Notify() { + pthread_mutex_lock(&mutex_); + notified_ = true; + pthread_mutex_unlock(&mutex_); + } + + // Blocks until the controller thread notifies. Must be called from a test + // thread. + void WaitForNotification() { + for (;;) { + pthread_mutex_lock(&mutex_); + const bool notified = notified_; + pthread_mutex_unlock(&mutex_); + if (notified) + break; + SleepMilliseconds(10); + } + } + + private: + pthread_mutex_t mutex_; + bool notified_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); +}; + +# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + +GTEST_API_ void SleepMilliseconds(int n); + +// Provides leak-safe Windows kernel handle ownership. +// Used in death tests and in threading support. +class GTEST_API_ AutoHandle { + public: + // Assume that Win32 HANDLE type is equivalent to void*. Doing so allows us to + // avoid including in this header file. Including is + // undesirable because it defines a lot of symbols and macros that tend to + // conflict with client code. This assumption is verified by + // WindowsTypesTest.HANDLEIsVoidStar. + typedef void* Handle; + AutoHandle(); + explicit AutoHandle(Handle handle); + + ~AutoHandle(); + + Handle Get() const; + void Reset(); + void Reset(Handle handle); + + private: + // Returns true iff the handle is a valid handle object that can be closed. + bool IsCloseable() const; + + Handle handle_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle); +}; + +// Allows a controller thread to pause execution of newly created +// threads until notified. Instances of this class must be created +// and destroyed in the controller thread. +// +// This class is only for testing Google Test's own constructs. Do not +// use it in user tests, either directly or indirectly. 
+class GTEST_API_ Notification { + public: + Notification(); + void Notify(); + void WaitForNotification(); + + private: + AutoHandle event_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); +}; +# endif // GTEST_HAS_NOTIFICATION_ + +// On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD +// defined, but we don't want to use MinGW's pthreads implementation, which +// has conformance problems with some versions of the POSIX standard. +# if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW + +// As a C-function, ThreadFuncWithCLinkage cannot be templated itself. +// Consequently, it cannot select a correct instantiation of ThreadWithParam +// in order to call its Run(). Introducing ThreadWithParamBase as a +// non-templated base class for ThreadWithParam allows us to bypass this +// problem. +class ThreadWithParamBase { + public: + virtual ~ThreadWithParamBase() {} + virtual void Run() = 0; +}; + +// pthread_create() accepts a pointer to a function type with the C linkage. +// According to the Standard (7.5/1), function types with different linkages +// are different even if they are otherwise identical. Some compilers (for +// example, SunStudio) treat them as different types. Since class methods +// cannot be defined with C-linkage we need to define a free C-function to +// pass into pthread_create(). +extern "C" inline void* ThreadFuncWithCLinkage(void* thread) { + static_cast(thread)->Run(); + return NULL; +} + +// Helper class for testing Google Test's multi-threading constructs. +// To use it, write: +// +// void ThreadFunc(int param) { /* Do things with param */ } +// Notification thread_can_start; +// ... +// // The thread_can_start parameter is optional; you can supply NULL. +// ThreadWithParam thread(&ThreadFunc, 5, &thread_can_start); +// thread_can_start.Notify(); +// +// These classes are only for testing Google Test's own constructs. Do +// not use them in user tests, either directly or indirectly. +template +class ThreadWithParam : public ThreadWithParamBase { + public: + typedef void UserThreadFunc(T); + + ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start) + : func_(func), + param_(param), + thread_can_start_(thread_can_start), + finished_(false) { + ThreadWithParamBase* const base = this; + // The thread can be created only after all fields except thread_ + // have been initialized. + GTEST_CHECK_POSIX_SUCCESS_( + pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base)); + } + ~ThreadWithParam() { Join(); } + + void Join() { + if (!finished_) { + GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0)); + finished_ = true; + } + } + + virtual void Run() { + if (thread_can_start_ != NULL) + thread_can_start_->WaitForNotification(); + func_(param_); + } + + private: + UserThreadFunc* const func_; // User-supplied thread function. + const T param_; // User-supplied parameter to the thread function. + // When non-NULL, used to block execution until the controller thread + // notifies. + Notification* const thread_can_start_; + bool finished_; // true iff we know that the thread function has finished. + pthread_t thread_; // The native thread object. + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); +}; +# endif // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD || + // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ + +# if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ +// Mutex and ThreadLocal have already been imported into the namespace. +// Nothing to do here. 
+ +# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + +// Mutex implements mutex on Windows platforms. It is used in conjunction +// with class MutexLock: +// +// Mutex mutex; +// ... +// MutexLock lock(&mutex); // Acquires the mutex and releases it at the +// // end of the current scope. +// +// A static Mutex *must* be defined or declared using one of the following +// macros: +// GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex); +// GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex); +// +// (A non-static Mutex is defined/declared in the usual way). +class GTEST_API_ Mutex { + public: + enum MutexType { kStatic = 0, kDynamic = 1 }; + // We rely on kStaticMutex being 0 as it is to what the linker initializes + // type_ in static mutexes. critical_section_ will be initialized lazily + // in ThreadSafeLazyInit(). + enum StaticConstructorSelector { kStaticMutex = 0 }; + + // This constructor intentionally does nothing. It relies on type_ being + // statically initialized to 0 (effectively setting it to kStatic) and on + // ThreadSafeLazyInit() to lazily initialize the rest of the members. + explicit Mutex(StaticConstructorSelector /*dummy*/) {} + + Mutex(); + ~Mutex(); + + void Lock(); + + void Unlock(); + + // Does nothing if the current thread holds the mutex. Otherwise, crashes + // with high probability. + void AssertHeld(); + + private: + // Initializes owner_thread_id_ and critical_section_ in static mutexes. + void ThreadSafeLazyInit(); + + // Per http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx, + // we assume that 0 is an invalid value for thread IDs. + unsigned int owner_thread_id_; + + // For static mutexes, we rely on these members being initialized to zeros + // by the linker. + MutexType type_; + long critical_section_init_phase_; // NOLINT + _RTL_CRITICAL_SECTION* critical_section_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); +}; + +# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex + +# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex) + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(Mutex* mutex) + : mutex_(mutex) { mutex_->Lock(); } + + ~GTestMutexLock() { mutex_->Unlock(); } + + private: + Mutex* const mutex_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); +}; + +typedef GTestMutexLock MutexLock; + +// Base class for ValueHolder. Allows a caller to hold and delete a value +// without knowing its type. +class ThreadLocalValueHolderBase { + public: + virtual ~ThreadLocalValueHolderBase() {} +}; + +// Provides a way for a thread to send notifications to a ThreadLocal +// regardless of its parameter type. +class ThreadLocalBase { + public: + // Creates a new ValueHolder object holding a default value passed to + // this ThreadLocal's constructor and returns it. It is the caller's + // responsibility not to call this when the ThreadLocal instance already + // has a value on the current thread. 
+ virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const = 0; + + protected: + ThreadLocalBase() {} + virtual ~ThreadLocalBase() {} + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocalBase); +}; + +// Maps a thread to a set of ThreadLocals that have values instantiated on that +// thread and notifies them when the thread exits. A ThreadLocal instance is +// expected to persist until all threads it has values on have terminated. +class GTEST_API_ ThreadLocalRegistry { + public: + // Registers thread_local_instance as having value on the current thread. + // Returns a value that can be used to identify the thread from other threads. + static ThreadLocalValueHolderBase* GetValueOnCurrentThread( + const ThreadLocalBase* thread_local_instance); + + // Invoked when a ThreadLocal instance is destroyed. + static void OnThreadLocalDestroyed( + const ThreadLocalBase* thread_local_instance); +}; + +class GTEST_API_ ThreadWithParamBase { + public: + void Join(); + + protected: + class Runnable { + public: + virtual ~Runnable() {} + virtual void Run() = 0; + }; + + ThreadWithParamBase(Runnable *runnable, Notification* thread_can_start); + virtual ~ThreadWithParamBase(); + + private: + AutoHandle thread_; +}; + +// Helper class for testing Google Test's multi-threading constructs. +template +class ThreadWithParam : public ThreadWithParamBase { + public: + typedef void UserThreadFunc(T); + + ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start) + : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) { + } + virtual ~ThreadWithParam() {} + + private: + class RunnableImpl : public Runnable { + public: + RunnableImpl(UserThreadFunc* func, T param) + : func_(func), + param_(param) { + } + virtual ~RunnableImpl() {} + virtual void Run() { + func_(param_); + } + + private: + UserThreadFunc* const func_; + const T param_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(RunnableImpl); + }; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); +}; + +// Implements thread-local storage on Windows systems. +// +// // Thread 1 +// ThreadLocal tl(100); // 100 is the default value for each thread. +// +// // Thread 2 +// tl.set(150); // Changes the value for thread 2 only. +// EXPECT_EQ(150, tl.get()); +// +// // Thread 1 +// EXPECT_EQ(100, tl.get()); // In thread 1, tl has the original value. +// tl.set(200); +// EXPECT_EQ(200, tl.get()); +// +// The template type argument T must have a public copy constructor. +// In addition, the default ThreadLocal constructor requires T to have +// a public default constructor. +// +// The users of a TheadLocal instance have to make sure that all but one +// threads (including the main one) using that instance have exited before +// destroying it. Otherwise, the per-thread objects managed for them by the +// ThreadLocal instance are not guaranteed to be destroyed on all platforms. +// +// Google Test only uses global ThreadLocal objects. That means they +// will die after main() has returned. Therefore, no per-thread +// object managed by Google Test will be leaked as long as all threads +// using Google Test have exited when main() returns. 
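+//
+// Editor's illustrative sketch (not part of the upstream header); the name
+// g_counter is hypothetical. A ThreadLocal at namespace scope gives each
+// thread its own independent copy of the value:
+//
+//   static testing::internal::ThreadLocal<int> g_counter(0);
+//   ...
+//   g_counter.set(g_counter.get() + 1);  // Affects the calling thread only.
+//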
+template +class ThreadLocal : public ThreadLocalBase { + public: + ThreadLocal() : default_factory_(new DefaultValueHolderFactory()) {} + explicit ThreadLocal(const T& value) + : default_factory_(new InstanceValueHolderFactory(value)) {} + + ~ThreadLocal() { ThreadLocalRegistry::OnThreadLocalDestroyed(this); } + + T* pointer() { return GetOrCreateValue(); } + const T* pointer() const { return GetOrCreateValue(); } + const T& get() const { return *pointer(); } + void set(const T& value) { *pointer() = value; } + + private: + // Holds a value of T. Can be deleted via its base class without the caller + // knowing the type of T. + class ValueHolder : public ThreadLocalValueHolderBase { + public: + ValueHolder() : value_() {} + explicit ValueHolder(const T& value) : value_(value) {} + + T* pointer() { return &value_; } + + private: + T value_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); + }; + + + T* GetOrCreateValue() const { + return static_cast( + ThreadLocalRegistry::GetValueOnCurrentThread(this))->pointer(); + } + + virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const { + return default_factory_->MakeNewHolder(); + } + + class ValueHolderFactory { + public: + ValueHolderFactory() {} + virtual ~ValueHolderFactory() {} + virtual ValueHolder* MakeNewHolder() const = 0; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory); + }; + + class DefaultValueHolderFactory : public ValueHolderFactory { + public: + DefaultValueHolderFactory() {} + virtual ValueHolder* MakeNewHolder() const { return new ValueHolder(); } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory); + }; + + class InstanceValueHolderFactory : public ValueHolderFactory { + public: + explicit InstanceValueHolderFactory(const T& value) : value_(value) {} + virtual ValueHolder* MakeNewHolder() const { + return new ValueHolder(value_); + } + + private: + const T value_; // The value for each thread. + + GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory); + }; + + scoped_ptr default_factory_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); +}; + +# elif GTEST_HAS_PTHREAD + +// MutexBase and Mutex implement mutex on pthreads-based platforms. +class MutexBase { + public: + // Acquires this mutex. + void Lock() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_)); + owner_ = pthread_self(); + has_owner_ = true; + } + + // Releases this mutex. + void Unlock() { + // Since the lock is being released the owner_ field should no longer be + // considered valid. We don't protect writing to has_owner_ here, as it's + // the caller's responsibility to ensure that the current thread holds the + // mutex when this is called. + has_owner_ = false; + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_)); + } + + // Does nothing if the current thread holds the mutex. Otherwise, crashes + // with high probability. + void AssertHeld() const { + GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self())) + << "The current thread is not holding the mutex @" << this; + } + + // A static mutex may be used before main() is entered. It may even + // be used before the dynamic initialization stage. Therefore we + // must be able to initialize a static mutex object at link time. + // This means MutexBase has to be a POD and its member variables + // have to be public. + public: + pthread_mutex_t mutex_; // The underlying pthread mutex. 
+ // has_owner_ indicates whether the owner_ field below contains a valid thread + // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All + // accesses to the owner_ field should be protected by a check of this field. + // An alternative might be to memset() owner_ to all zeros, but there's no + // guarantee that a zero'd pthread_t is necessarily invalid or even different + // from pthread_self(). + bool has_owner_; + pthread_t owner_; // The thread holding the mutex. +}; + +// Forward-declares a static mutex. +# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::MutexBase mutex + +// Defines and statically (i.e. at link time) initializes a static mutex. +# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false, pthread_t() } + +// The Mutex class can only be used for mutexes created at runtime. It +// shares its API with MutexBase otherwise. +class Mutex : public MutexBase { + public: + Mutex() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL)); + has_owner_ = false; + } + ~Mutex() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); + } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); +}; + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(MutexBase* mutex) + : mutex_(mutex) { mutex_->Lock(); } + + ~GTestMutexLock() { mutex_->Unlock(); } + + private: + MutexBase* const mutex_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); +}; + +typedef GTestMutexLock MutexLock; + +// Helpers for ThreadLocal. + +// pthread_key_create() requires DeleteThreadLocalValue() to have +// C-linkage. Therefore it cannot be templatized to access +// ThreadLocal. Hence the need for class +// ThreadLocalValueHolderBase. +class ThreadLocalValueHolderBase { + public: + virtual ~ThreadLocalValueHolderBase() {} +}; + +// Called by pthread to delete thread-local data stored by +// pthread_setspecific(). +extern "C" inline void DeleteThreadLocalValue(void* value_holder) { + delete static_cast(value_holder); +} + +// Implements thread-local storage on pthreads-based systems. +template +class ThreadLocal { + public: + ThreadLocal() + : key_(CreateKey()), default_factory_(new DefaultValueHolderFactory()) {} + explicit ThreadLocal(const T& value) + : key_(CreateKey()), + default_factory_(new InstanceValueHolderFactory(value)) {} + + ~ThreadLocal() { + // Destroys the managed object for the current thread, if any. + DeleteThreadLocalValue(pthread_getspecific(key_)); + + // Releases resources associated with the key. This will *not* + // delete managed objects for other threads. + GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_)); + } + + T* pointer() { return GetOrCreateValue(); } + const T* pointer() const { return GetOrCreateValue(); } + const T& get() const { return *pointer(); } + void set(const T& value) { *pointer() = value; } + + private: + // Holds a value of type T. 
+ class ValueHolder : public ThreadLocalValueHolderBase { + public: + ValueHolder() : value_() {} + explicit ValueHolder(const T& value) : value_(value) {} + + T* pointer() { return &value_; } + + private: + T value_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); + }; + + static pthread_key_t CreateKey() { + pthread_key_t key; + // When a thread exits, DeleteThreadLocalValue() will be called on + // the object managed for that thread. + GTEST_CHECK_POSIX_SUCCESS_( + pthread_key_create(&key, &DeleteThreadLocalValue)); + return key; + } + + T* GetOrCreateValue() const { + ThreadLocalValueHolderBase* const holder = + static_cast(pthread_getspecific(key_)); + if (holder != NULL) { + return CheckedDowncastToActualType(holder)->pointer(); + } + + ValueHolder* const new_holder = default_factory_->MakeNewHolder(); + ThreadLocalValueHolderBase* const holder_base = new_holder; + GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base)); + return new_holder->pointer(); + } + + class ValueHolderFactory { + public: + ValueHolderFactory() {} + virtual ~ValueHolderFactory() {} + virtual ValueHolder* MakeNewHolder() const = 0; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory); + }; + + class DefaultValueHolderFactory : public ValueHolderFactory { + public: + DefaultValueHolderFactory() {} + virtual ValueHolder* MakeNewHolder() const { return new ValueHolder(); } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory); + }; + + class InstanceValueHolderFactory : public ValueHolderFactory { + public: + explicit InstanceValueHolderFactory(const T& value) : value_(value) {} + virtual ValueHolder* MakeNewHolder() const { + return new ValueHolder(value_); + } + + private: + const T value_; // The value for each thread. + + GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory); + }; + + // A key pthreads uses for looking up per-thread values. + const pthread_key_t key_; + scoped_ptr default_factory_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); +}; + +# endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ + +#else // GTEST_IS_THREADSAFE + +// A dummy implementation of synchronization primitives (mutex, lock, +// and thread-local variable). Necessary for compiling Google Test where +// mutex is not supported - using Google Test in multiple threads is not +// supported on such platforms. + +class Mutex { + public: + Mutex() {} + void Lock() {} + void Unlock() {} + void AssertHeld() const {} +}; + +# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex + +# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. 
+class GTestMutexLock { + public: + explicit GTestMutexLock(Mutex*) {} // NOLINT +}; + +typedef GTestMutexLock MutexLock; + +template +class ThreadLocal { + public: + ThreadLocal() : value_() {} + explicit ThreadLocal(const T& value) : value_(value) {} + T* pointer() { return &value_; } + const T* pointer() const { return &value_; } + const T& get() const { return value_; } + void set(const T& value) { value_ = value; } + private: + T value_; +}; + +#endif // GTEST_IS_THREADSAFE + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +GTEST_API_ size_t GetThreadCount(); + +// Passing non-POD classes through ellipsis (...) crashes the ARM +// compiler and generates a warning in Sun Studio. The Nokia Symbian +// and the IBM XL C/C++ compiler try to instantiate a copy constructor +// for objects passed through ellipsis (...), failing for uncopyable +// objects. We define this to ensure that only POD is passed through +// ellipsis on these systems. +#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || defined(__SUNPRO_CC) +// We lose support for NULL detection where the compiler doesn't like +// passing non-POD classes through ellipsis (...). +# define GTEST_ELLIPSIS_NEEDS_POD_ 1 +#else +# define GTEST_CAN_COMPARE_NULL 1 +#endif + +// The Nokia Symbian and IBM XL C/C++ compilers cannot decide between +// const T& and const T* in a function template. These compilers +// _can_ decide between class template specializations for T and T*, +// so a tr1::type_traits-like is_pointer works. +#if defined(__SYMBIAN32__) || defined(__IBMCPP__) +# define GTEST_NEEDS_IS_POINTER_ 1 +#endif + +template +struct bool_constant { + typedef bool_constant type; + static const bool value = bool_value; +}; +template const bool bool_constant::value; + +typedef bool_constant false_type; +typedef bool_constant true_type; + +template +struct is_pointer : public false_type {}; + +template +struct is_pointer : public true_type {}; + +template +struct IteratorTraits { + typedef typename Iterator::value_type value_type; +}; + +template +struct IteratorTraits { + typedef T value_type; +}; + +template +struct IteratorTraits { + typedef T value_type; +}; + +#if GTEST_OS_WINDOWS +# define GTEST_PATH_SEP_ "\\" +# define GTEST_HAS_ALT_PATH_SEP_ 1 +// The biggest signed integer type the compiler supports. +typedef __int64 BiggestInt; +#else +# define GTEST_PATH_SEP_ "/" +# define GTEST_HAS_ALT_PATH_SEP_ 0 +typedef long long BiggestInt; // NOLINT +#endif // GTEST_OS_WINDOWS + +// Utilities for char. + +// isspace(int ch) and friends accept an unsigned char or EOF. char +// may be signed, depending on the compiler (or compiler flags). +// Therefore we need to cast a char to unsigned char before calling +// isspace(), etc. 
+ +inline bool IsAlpha(char ch) { + return isalpha(static_cast(ch)) != 0; +} +inline bool IsAlNum(char ch) { + return isalnum(static_cast(ch)) != 0; +} +inline bool IsDigit(char ch) { + return isdigit(static_cast(ch)) != 0; +} +inline bool IsLower(char ch) { + return islower(static_cast(ch)) != 0; +} +inline bool IsSpace(char ch) { + return isspace(static_cast(ch)) != 0; +} +inline bool IsUpper(char ch) { + return isupper(static_cast(ch)) != 0; +} +inline bool IsXDigit(char ch) { + return isxdigit(static_cast(ch)) != 0; +} +inline bool IsXDigit(wchar_t ch) { + const unsigned char low_byte = static_cast(ch); + return ch == low_byte && isxdigit(low_byte) != 0; +} + +inline char ToLower(char ch) { + return static_cast(tolower(static_cast(ch))); +} +inline char ToUpper(char ch) { + return static_cast(toupper(static_cast(ch))); +} + +inline std::string StripTrailingSpaces(std::string str) { + std::string::iterator it = str.end(); + while (it != str.begin() && IsSpace(*--it)) + it = str.erase(it); + return str; +} + +// The testing::internal::posix namespace holds wrappers for common +// POSIX functions. These wrappers hide the differences between +// Windows/MSVC and POSIX systems. Since some compilers define these +// standard functions as macros, the wrapper cannot have the same name +// as the wrapped function. + +namespace posix { + +// Functions with a different name on Windows. + +#if GTEST_OS_WINDOWS + +typedef struct _stat StatStruct; + +# ifdef __BORLANDC__ +inline int IsATTY(int fd) { return isatty(fd); } +inline int StrCaseCmp(const char* s1, const char* s2) { + return stricmp(s1, s2); +} +inline char* StrDup(const char* src) { return strdup(src); } +# else // !__BORLANDC__ +# if GTEST_OS_WINDOWS_MOBILE +inline int IsATTY(int /* fd */) { return 0; } +# else +inline int IsATTY(int fd) { return _isatty(fd); } +# endif // GTEST_OS_WINDOWS_MOBILE +inline int StrCaseCmp(const char* s1, const char* s2) { + return _stricmp(s1, s2); +} +inline char* StrDup(const char* src) { return _strdup(src); } +# endif // __BORLANDC__ + +# if GTEST_OS_WINDOWS_MOBILE +inline int FileNo(FILE* file) { return reinterpret_cast(_fileno(file)); } +// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this +// time and thus not defined there. +# else +inline int FileNo(FILE* file) { return _fileno(file); } +inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); } +inline int RmDir(const char* dir) { return _rmdir(dir); } +inline bool IsDir(const StatStruct& st) { + return (_S_IFDIR & st.st_mode) != 0; +} +# endif // GTEST_OS_WINDOWS_MOBILE + +#else + +typedef struct stat StatStruct; + +inline int FileNo(FILE* file) { return fileno(file); } +inline int IsATTY(int fd) { return isatty(fd); } +inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); } +inline int StrCaseCmp(const char* s1, const char* s2) { + return strcasecmp(s1, s2); +} +inline char* StrDup(const char* src) { return strdup(src); } +inline int RmDir(const char* dir) { return rmdir(dir); } +inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } + +#endif // GTEST_OS_WINDOWS + +// Functions deprecated by MSVC 8.0. + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996 /* deprecated function */) + +inline const char* StrNCpy(char* dest, const char* src, size_t n) { + return strncpy(dest, src, n); +} + +// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and +// StrError() aren't needed on Windows CE at this time and thus not +// defined there. 
+ +#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT +inline int ChDir(const char* dir) { return chdir(dir); } +#endif +inline FILE* FOpen(const char* path, const char* mode) { + return fopen(path, mode); +} +#if !GTEST_OS_WINDOWS_MOBILE +inline FILE *FReopen(const char* path, const char* mode, FILE* stream) { + return freopen(path, mode, stream); +} +inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); } +#endif +inline int FClose(FILE* fp) { return fclose(fp); } +#if !GTEST_OS_WINDOWS_MOBILE +inline int Read(int fd, void* buf, unsigned int count) { + return static_cast(read(fd, buf, count)); +} +inline int Write(int fd, const void* buf, unsigned int count) { + return static_cast(write(fd, buf, count)); +} +inline int Close(int fd) { return close(fd); } +inline const char* StrError(int errnum) { return strerror(errnum); } +#endif +inline const char* GetEnv(const char* name) { +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE | GTEST_OS_WINDOWS_RT + // We are on Windows CE, which has no environment variables. + static_cast(name); // To prevent 'unused argument' warning. + return NULL; +#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9) + // Environment variables which we programmatically clear will be set to the + // empty string rather than unset (NULL). Handle that case. + const char* const env = getenv(name); + return (env != NULL && env[0] != '\0') ? env : NULL; +#else + return getenv(name); +#endif +} + +GTEST_DISABLE_MSC_WARNINGS_POP_() + +#if GTEST_OS_WINDOWS_MOBILE +// Windows CE has no C library. The abort() function is used in +// several places in Google Test. This implementation provides a reasonable +// imitation of standard behaviour. +void Abort(); +#else +inline void Abort() { abort(); } +#endif // GTEST_OS_WINDOWS_MOBILE + +} // namespace posix + +// MSVC "deprecates" snprintf and issues warnings wherever it is used. In +// order to avoid these warnings, we need to use _snprintf or _snprintf_s on +// MSVC-based platforms. We map the GTEST_SNPRINTF_ macro to the appropriate +// function in order to achieve that. We use macro definition here because +// snprintf is a variadic function. +#if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE +// MSVC 2005 and above support variadic macros. +# define GTEST_SNPRINTF_(buffer, size, format, ...) \ + _snprintf_s(buffer, size, size, format, __VA_ARGS__) +#elif defined(_MSC_VER) +// Windows CE does not define _snprintf_s and MSVC prior to 2005 doesn't +// complain about _snprintf. +# define GTEST_SNPRINTF_ _snprintf +#else +# define GTEST_SNPRINTF_ snprintf +#endif + +// The maximum number a BiggestInt can represent. This definition +// works no matter BiggestInt is represented in one's complement or +// two's complement. +// +// We cannot rely on numeric_limits in STL, as __int64 and long long +// are not part of standard C++ and numeric_limits doesn't need to be +// defined for them. +const BiggestInt kMaxBiggestInt = + ~(static_cast(1) << (8*sizeof(BiggestInt) - 1)); + +// This template class serves as a compile-time function from size to +// type. It maps a size in bytes to a primitive type with that +// size. e.g. +// +// TypeWithSize<4>::UInt +// +// is typedef-ed to be unsigned int (unsigned integer made up of 4 +// bytes). +// +// Such functionality should belong to STL, but I cannot find it +// there. +// +// Google Test uses this class in the implementation of floating-point +// comparison. 
+// +// For now it only handles UInt (unsigned int) as that's all Google Test +// needs. Other types can be easily added in the future if need +// arises. +template +class TypeWithSize { + public: + // This prevents the user from using TypeWithSize with incorrect + // values of N. + typedef void UInt; +}; + +// The specialization for size 4. +template <> +class TypeWithSize<4> { + public: + // unsigned int has size 4 in both gcc and MSVC. + // + // As base/basictypes.h doesn't compile on Windows, we cannot use + // uint32, uint64, and etc here. + typedef int Int; + typedef unsigned int UInt; +}; + +// The specialization for size 8. +template <> +class TypeWithSize<8> { + public: +#if GTEST_OS_WINDOWS + typedef __int64 Int; + typedef unsigned __int64 UInt; +#else + typedef long long Int; // NOLINT + typedef unsigned long long UInt; // NOLINT +#endif // GTEST_OS_WINDOWS +}; + +// Integer types of known sizes. +typedef TypeWithSize<4>::Int Int32; +typedef TypeWithSize<4>::UInt UInt32; +typedef TypeWithSize<8>::Int Int64; +typedef TypeWithSize<8>::UInt UInt64; +typedef TypeWithSize<8>::Int TimeInMillis; // Represents time in milliseconds. + +// Utilities for command line flags and environment variables. + +// Macro for referencing flags. +#if !defined(GTEST_FLAG) +# define GTEST_FLAG(name) FLAGS_gtest_##name +#endif // !defined(GTEST_FLAG) + +#if !defined(GTEST_USE_OWN_FLAGFILE_FLAG_) +# define GTEST_USE_OWN_FLAGFILE_FLAG_ 1 +#endif // !defined(GTEST_USE_OWN_FLAGFILE_FLAG_) + +#if !defined(GTEST_DECLARE_bool_) +# define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver + +// Macros for declaring flags. +# define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name) +# define GTEST_DECLARE_int32_(name) \ + GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name) +#define GTEST_DECLARE_string_(name) \ + GTEST_API_ extern ::std::string GTEST_FLAG(name) + +// Macros for defining flags. +#define GTEST_DEFINE_bool_(name, default_val, doc) \ + GTEST_API_ bool GTEST_FLAG(name) = (default_val) +#define GTEST_DEFINE_int32_(name, default_val, doc) \ + GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val) +#define GTEST_DEFINE_string_(name, default_val, doc) \ + GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val) + +#endif // !defined(GTEST_DECLARE_bool_) + +// Thread annotations +#if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_) +# define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) +# define GTEST_LOCK_EXCLUDED_(locks) +#endif // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_) + +// Parses 'str' for a 32-bit signed integer. If successful, writes the result +// to *value and returns true; otherwise leaves *value unchanged and returns +// false. +// TODO(chandlerc): Find a better way to refactor flag and environment parsing +// out of both gtest-port.cc and gtest.cc to avoid exporting this utility +// function. +bool ParseInt32(const Message& src_text, const char* str, Int32* value); + +// Parses a bool/Int32/string from the environment variable +// corresponding to the given Google Test flag. 
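// A flag named "foo" maps to the environment variable GTEST_FOO; for
// instance (an illustrative sketch, mirroring how the flag defaults are
// initialized elsewhere in Google Test):
//
//   StringFromGTestEnv("color", "auto");   // consults $GTEST_COLOR
//   BoolFromGTestEnv("print_time", true);  // consults $GTEST_PRINT_TIME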
+bool BoolFromGTestEnv(const char* flag, bool default_val); +GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val); +std::string StringFromGTestEnv(const char* flag, const char* default_val); + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h new file mode 100644 index 0000000000..97f1a7fdd2 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h @@ -0,0 +1,167 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file declares the String class and functions used internally by +// Google Test. They are subject to change without notice. They should not used +// by code external to Google Test. +// +// This header file is #included by . +// It should not be #included by other files. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ + +#ifdef __BORLANDC__ +// string.h is not guaranteed to provide strcpy on C++ Builder. +# include +#endif + +#include +#include + +#include "gtest/internal/gtest-port.h" + +namespace testing { +namespace internal { + +// String - an abstract class holding static string utilities. +class GTEST_API_ String { + public: + // Static utility methods + + // Clones a 0-terminated C string, allocating memory using new. The + // caller is responsible for deleting the return value using + // delete[]. Returns the cloned string, or NULL if the input is + // NULL. + // + // This is different from strdup() in string.h, which allocates + // memory using malloc(). 
+ static const char* CloneCString(const char* c_str); + +#if GTEST_OS_WINDOWS_MOBILE + // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be + // able to pass strings to Win32 APIs on CE we need to convert them + // to 'Unicode', UTF-16. + + // Creates a UTF-16 wide string from the given ANSI string, allocating + // memory using new. The caller is responsible for deleting the return + // value using delete[]. Returns the wide string, or NULL if the + // input is NULL. + // + // The wide string is created using the ANSI codepage (CP_ACP) to + // match the behaviour of the ANSI versions of Win32 calls and the + // C runtime. + static LPCWSTR AnsiToUtf16(const char* c_str); + + // Creates an ANSI string from the given wide string, allocating + // memory using new. The caller is responsible for deleting the return + // value using delete[]. Returns the ANSI string, or NULL if the + // input is NULL. + // + // The returned string is created using the ANSI codepage (CP_ACP) to + // match the behaviour of the ANSI versions of Win32 calls and the + // C runtime. + static const char* Utf16ToAnsi(LPCWSTR utf16_str); +#endif + + // Compares two C strings. Returns true iff they have the same content. + // + // Unlike strcmp(), this function can handle NULL argument(s). A + // NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool CStringEquals(const char* lhs, const char* rhs); + + // Converts a wide C string to a String using the UTF-8 encoding. + // NULL will be converted to "(null)". If an error occurred during + // the conversion, "(failed to convert from wide string)" is + // returned. + static std::string ShowWideCString(const wchar_t* wide_c_str); + + // Compares two wide C strings. Returns true iff they have the same + // content. + // + // Unlike wcscmp(), this function can handle NULL argument(s). A + // NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs); + + // Compares two C strings, ignoring case. Returns true iff they + // have the same content. + // + // Unlike strcasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool CaseInsensitiveCStringEquals(const char* lhs, + const char* rhs); + + // Compares two wide C strings, ignoring case. Returns true iff they + // have the same content. + // + // Unlike wcscasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL wide C string, + // including the empty string. + // NB: The implementations on different platforms slightly differ. + // On windows, this method uses _wcsicmp which compares according to LC_CTYPE + // environment variable. On GNU platform this method uses wcscasecmp + // which compares according to LC_CTYPE category of the current locale. + // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the + // current locale. + static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs, + const wchar_t* rhs); + + // Returns true iff the given string ends with the given suffix, ignoring + // case. Any string is considered to end with an empty suffix. + static bool EndsWithCaseInsensitive( + const std::string& str, const std::string& suffix); + + // Formats an int value as "%02d". 
+ static std::string FormatIntWidth2(int value); // "%02d" for width == 2 + + // Formats an int value as "%X". + static std::string FormatHexInt(int value); + + // Formats a byte as "%02X". + static std::string FormatByte(unsigned char value); + + private: + String(); // Not meant to be instantiated. +}; // class String + +// Gets the content of the stringstream's buffer as an std::string. Each '\0' +// character in the buffer is replaced with "\\0". +GTEST_API_ std::string StringStreamToString(::std::stringstream* stream); + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h new file mode 100644 index 0000000000..e9b405340a --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h @@ -0,0 +1,1020 @@ +// This file was GENERATED by command: +// pump.py gtest-tuple.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2009 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Implements a subset of TR1 tuple needed by Google Test and Google Mock. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ + +#include // For ::std::pair. + +// The compiler used in Symbian has a bug that prevents us from declaring the +// tuple template as a friend (it complains that tuple is redefined). This +// hack bypasses the bug by declaring the members that should otherwise be +// private as public. +// Sun Studio versions < 12 also have the above bug. 
+#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590) +# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public: +#else +# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \ + template friend class tuple; \ + private: +#endif + +// Visual Studio 2010, 2012, and 2013 define symbols in std::tr1 that conflict +// with our own definitions. Therefore using our own tuple does not work on +// those compilers. +#if defined(_MSC_VER) && _MSC_VER >= 1600 /* 1600 is Visual Studio 2010 */ +# error "gtest's tuple doesn't compile on Visual Studio 2010 or later. \ +GTEST_USE_OWN_TR1_TUPLE must be set to 0 on those compilers." +#endif + +// GTEST_n_TUPLE_(T) is the type of an n-tuple. +#define GTEST_0_TUPLE_(T) tuple<> +#define GTEST_1_TUPLE_(T) tuple +#define GTEST_2_TUPLE_(T) tuple +#define GTEST_3_TUPLE_(T) tuple +#define GTEST_4_TUPLE_(T) tuple +#define GTEST_5_TUPLE_(T) tuple +#define GTEST_6_TUPLE_(T) tuple +#define GTEST_7_TUPLE_(T) tuple +#define GTEST_8_TUPLE_(T) tuple +#define GTEST_9_TUPLE_(T) tuple +#define GTEST_10_TUPLE_(T) tuple + +// GTEST_n_TYPENAMES_(T) declares a list of n typenames. +#define GTEST_0_TYPENAMES_(T) +#define GTEST_1_TYPENAMES_(T) typename T##0 +#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1 +#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2 +#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3 +#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4 +#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5 +#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5, typename T##6 +#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5, typename T##6, typename T##7 +#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5, typename T##6, \ + typename T##7, typename T##8 +#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5, typename T##6, \ + typename T##7, typename T##8, typename T##9 + +// In theory, defining stuff in the ::std namespace is undefined +// behavior. We can do this as we are playing the role of a standard +// library vendor. +namespace std { +namespace tr1 { + +template +class tuple; + +// Anything in namespace gtest_internal is Google Test's INTERNAL +// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code. +namespace gtest_internal { + +// ByRef::type is T if T is a reference; otherwise it's const T&. +template +struct ByRef { typedef const T& type; }; // NOLINT +template +struct ByRef { typedef T& type; }; // NOLINT + +// A handy wrapper for ByRef. +#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef::type + +// AddRef::type is T if T is a reference; otherwise it's T&. This +// is the same as tr1::add_reference::type. +template +struct AddRef { typedef T& type; }; // NOLINT +template +struct AddRef { typedef T& type; }; // NOLINT + +// A handy wrapper for AddRef. +#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef::type + +// A helper for implementing get(). +template class Get; + +// A helper for implementing tuple_element. kIndexValid is true +// iff k < the number of fields in tuple type T. 
+template +struct TupleElement; + +template +struct TupleElement { + typedef T0 type; +}; + +template +struct TupleElement { + typedef T1 type; +}; + +template +struct TupleElement { + typedef T2 type; +}; + +template +struct TupleElement { + typedef T3 type; +}; + +template +struct TupleElement { + typedef T4 type; +}; + +template +struct TupleElement { + typedef T5 type; +}; + +template +struct TupleElement { + typedef T6 type; +}; + +template +struct TupleElement { + typedef T7 type; +}; + +template +struct TupleElement { + typedef T8 type; +}; + +template +struct TupleElement { + typedef T9 type; +}; + +} // namespace gtest_internal + +template <> +class tuple<> { + public: + tuple() {} + tuple(const tuple& /* t */) {} + tuple& operator=(const tuple& /* t */) { return *this; } +}; + +template +class GTEST_1_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {} + + tuple(const tuple& t) : f0_(t.f0_) {} + + template + tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_1_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) { + f0_ = t.f0_; + return *this; + } + + T0 f0_; +}; + +template +class GTEST_2_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0), + f1_(f1) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {} + + template + tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {} + template + tuple(const ::std::pair& p) : f0_(p.first), f1_(p.second) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_2_TUPLE_(U)& t) { + return CopyFrom(t); + } + template + tuple& operator=(const ::std::pair& p) { + f0_ = p.first; + f1_ = p.second; + return *this; + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + return *this; + } + + T0 f0_; + T1 f1_; +}; + +template +class GTEST_3_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} + + template + tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_3_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; +}; + +template +class GTEST_4_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2), + f3_(f3) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {} + + template + tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_4_TUPLE_(U)& 
t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; +}; + +template +class GTEST_5_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, + GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_) {} + + template + tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_5_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; +}; + +template +class GTEST_6_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), + f5_(f5) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_) {} + + template + tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_6_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; +}; + +template +class GTEST_7_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2), + f3_(f3), f4_(f4), f5_(f5), f6_(f6) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} + + template + tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_7_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; + T6 f6_; +}; + +template +class GTEST_8_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + 
GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, + GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), + f5_(f5), f6_(f6), f7_(f7) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} + + template + tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_8_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + f7_ = t.f7_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; + T6 f6_; + T7 f7_; +}; + +template +class GTEST_9_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, + GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), + f5_(f5), f6_(f6), f7_(f7), f8_(f8) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} + + template + tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_9_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + f7_ = t.f7_; + f8_ = t.f8_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; + T6 f6_; + T7 f7_; + T8 f8_; +}; + +template +class tuple { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(), + f9_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, + GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2), + f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {} + + template + tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), + f9_(t.f9_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_10_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + f7_ = t.f7_; + f8_ = t.f8_; + f9_ = t.f9_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 
f4_; + T5 f5_; + T6 f6_; + T7 f7_; + T8 f8_; + T9 f9_; +}; + +// 6.1.3.2 Tuple creation functions. + +// Known limitations: we don't support passing an +// std::tr1::reference_wrapper to make_tuple(). And we don't +// implement tie(). + +inline tuple<> make_tuple() { return tuple<>(); } + +template +inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) { + return GTEST_1_TUPLE_(T)(f0); +} + +template +inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) { + return GTEST_2_TUPLE_(T)(f0, f1); +} + +template +inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) { + return GTEST_3_TUPLE_(T)(f0, f1, f2); +} + +template +inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3) { + return GTEST_4_TUPLE_(T)(f0, f1, f2, f3); +} + +template +inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4) { + return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4); +} + +template +inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5) { + return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5); +} + +template +inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6) { + return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6); +} + +template +inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) { + return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7); +} + +template +inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, + const T8& f8) { + return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8); +} + +template +inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, + const T8& f8, const T9& f9) { + return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9); +} + +// 6.1.3.3 Tuple helper classes. + +template struct tuple_size; + +template +struct tuple_size { + static const int value = 0; +}; + +template +struct tuple_size { + static const int value = 1; +}; + +template +struct tuple_size { + static const int value = 2; +}; + +template +struct tuple_size { + static const int value = 3; +}; + +template +struct tuple_size { + static const int value = 4; +}; + +template +struct tuple_size { + static const int value = 5; +}; + +template +struct tuple_size { + static const int value = 6; +}; + +template +struct tuple_size { + static const int value = 7; +}; + +template +struct tuple_size { + static const int value = 8; +}; + +template +struct tuple_size { + static const int value = 9; +}; + +template +struct tuple_size { + static const int value = 10; +}; + +template +struct tuple_element { + typedef typename gtest_internal::TupleElement< + k < (tuple_size::value), k, Tuple>::type type; +}; + +#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element::type + +// 6.1.3.4 Element access. 
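//
// A brief usage sketch (illustrative only; it relies on the get<k>() helpers
// defined below):
//
//   ::std::tr1::tuple<int, char> t = ::std::tr1::make_tuple(1, 'a');
//   ::std::tr1::get<0>(t);  // yields 1
//   ::std::tr1::get<1>(t);  // yields 'a'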
+ +namespace gtest_internal { + +template <> +class Get<0> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) + Field(Tuple& t) { return t.f0_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) + ConstField(const Tuple& t) { return t.f0_; } +}; + +template <> +class Get<1> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) + Field(Tuple& t) { return t.f1_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) + ConstField(const Tuple& t) { return t.f1_; } +}; + +template <> +class Get<2> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) + Field(Tuple& t) { return t.f2_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) + ConstField(const Tuple& t) { return t.f2_; } +}; + +template <> +class Get<3> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) + Field(Tuple& t) { return t.f3_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) + ConstField(const Tuple& t) { return t.f3_; } +}; + +template <> +class Get<4> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) + Field(Tuple& t) { return t.f4_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) + ConstField(const Tuple& t) { return t.f4_; } +}; + +template <> +class Get<5> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) + Field(Tuple& t) { return t.f5_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) + ConstField(const Tuple& t) { return t.f5_; } +}; + +template <> +class Get<6> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) + Field(Tuple& t) { return t.f6_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) + ConstField(const Tuple& t) { return t.f6_; } +}; + +template <> +class Get<7> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) + Field(Tuple& t) { return t.f7_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) + ConstField(const Tuple& t) { return t.f7_; } +}; + +template <> +class Get<8> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) + Field(Tuple& t) { return t.f8_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) + ConstField(const Tuple& t) { return t.f8_; } +}; + +template <> +class Get<9> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) + Field(Tuple& t) { return t.f9_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) + ConstField(const Tuple& t) { return t.f9_; } +}; + +} // namespace gtest_internal + +template +GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) +get(GTEST_10_TUPLE_(T)& t) { + return gtest_internal::Get::Field(t); +} + +template +GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) +get(const GTEST_10_TUPLE_(T)& t) { + return gtest_internal::Get::ConstField(t); +} + +// 6.1.3.5 Relational operators + +// We only implement == and !=, as we don't have a need for the rest yet. + +namespace gtest_internal { + +// SameSizeTuplePrefixComparator::Eq(t1, t2) returns true if the +// first k fields of t1 equals the first k fields of t2. +// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if +// k1 != k2. 
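// For example, the k == 2 instantiation unrolls (illustrative only) to
//
//   SameSizeTuplePrefixComparator<1, 1>::Eq(t1, t2) &&
//       ::std::tr1::get<1>(t1) == ::std::tr1::get<1>(t2)
//
// which bottoms out at the <0, 0> specialization and therefore compares
// get<0> and get<1> of both tuples.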
+template +struct SameSizeTuplePrefixComparator; + +template <> +struct SameSizeTuplePrefixComparator<0, 0> { + template + static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) { + return true; + } +}; + +template +struct SameSizeTuplePrefixComparator { + template + static bool Eq(const Tuple1& t1, const Tuple2& t2) { + return SameSizeTuplePrefixComparator::Eq(t1, t2) && + ::std::tr1::get(t1) == ::std::tr1::get(t2); + } +}; + +} // namespace gtest_internal + +template +inline bool operator==(const GTEST_10_TUPLE_(T)& t, + const GTEST_10_TUPLE_(U)& u) { + return gtest_internal::SameSizeTuplePrefixComparator< + tuple_size::value, + tuple_size::value>::Eq(t, u); +} + +template +inline bool operator!=(const GTEST_10_TUPLE_(T)& t, + const GTEST_10_TUPLE_(U)& u) { return !(t == u); } + +// 6.1.4 Pairs. +// Unimplemented. + +} // namespace tr1 +} // namespace std + +#undef GTEST_0_TUPLE_ +#undef GTEST_1_TUPLE_ +#undef GTEST_2_TUPLE_ +#undef GTEST_3_TUPLE_ +#undef GTEST_4_TUPLE_ +#undef GTEST_5_TUPLE_ +#undef GTEST_6_TUPLE_ +#undef GTEST_7_TUPLE_ +#undef GTEST_8_TUPLE_ +#undef GTEST_9_TUPLE_ +#undef GTEST_10_TUPLE_ + +#undef GTEST_0_TYPENAMES_ +#undef GTEST_1_TYPENAMES_ +#undef GTEST_2_TYPENAMES_ +#undef GTEST_3_TYPENAMES_ +#undef GTEST_4_TYPENAMES_ +#undef GTEST_5_TYPENAMES_ +#undef GTEST_6_TYPENAMES_ +#undef GTEST_7_TYPENAMES_ +#undef GTEST_8_TYPENAMES_ +#undef GTEST_9_TYPENAMES_ +#undef GTEST_10_TYPENAMES_ + +#undef GTEST_DECLARE_TUPLE_AS_FRIEND_ +#undef GTEST_BY_REF_ +#undef GTEST_ADD_REF_ +#undef GTEST_TUPLE_ELEMENT_ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h.pump b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h.pump new file mode 100644 index 0000000000..429ddfeeca --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h.pump @@ -0,0 +1,347 @@ +$$ -*- mode: c++; -*- +$var n = 10 $$ Maximum number of tuple fields we want to support. +$$ This meta comment fixes auto-indentation in Emacs. }} +// Copyright 2009 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Implements a subset of TR1 tuple needed by Google Test and Google Mock. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ + +#include // For ::std::pair. + +// The compiler used in Symbian has a bug that prevents us from declaring the +// tuple template as a friend (it complains that tuple is redefined). This +// hack bypasses the bug by declaring the members that should otherwise be +// private as public. +// Sun Studio versions < 12 also have the above bug. +#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590) +# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public: +#else +# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \ + template friend class tuple; \ + private: +#endif + +// Visual Studio 2010, 2012, and 2013 define symbols in std::tr1 that conflict +// with our own definitions. Therefore using our own tuple does not work on +// those compilers. +#if defined(_MSC_VER) && _MSC_VER >= 1600 /* 1600 is Visual Studio 2010 */ +# error "gtest's tuple doesn't compile on Visual Studio 2010 or later. \ +GTEST_USE_OWN_TR1_TUPLE must be set to 0 on those compilers." +#endif + + +$range i 0..n-1 +$range j 0..n +$range k 1..n +// GTEST_n_TUPLE_(T) is the type of an n-tuple. +#define GTEST_0_TUPLE_(T) tuple<> + +$for k [[ +$range m 0..k-1 +$range m2 k..n-1 +#define GTEST_$(k)_TUPLE_(T) tuple<$for m, [[T##$m]]$for m2 [[, void]]> + +]] + +// GTEST_n_TYPENAMES_(T) declares a list of n typenames. + +$for j [[ +$range m 0..j-1 +#define GTEST_$(j)_TYPENAMES_(T) $for m, [[typename T##$m]] + + +]] + +// In theory, defining stuff in the ::std namespace is undefined +// behavior. We can do this as we are playing the role of a standard +// library vendor. +namespace std { +namespace tr1 { + +template <$for i, [[typename T$i = void]]> +class tuple; + +// Anything in namespace gtest_internal is Google Test's INTERNAL +// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code. +namespace gtest_internal { + +// ByRef::type is T if T is a reference; otherwise it's const T&. +template +struct ByRef { typedef const T& type; }; // NOLINT +template +struct ByRef { typedef T& type; }; // NOLINT + +// A handy wrapper for ByRef. +#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef::type + +// AddRef::type is T if T is a reference; otherwise it's T&. This +// is the same as tr1::add_reference::type. +template +struct AddRef { typedef T& type; }; // NOLINT +template +struct AddRef { typedef T& type; }; // NOLINT + +// A handy wrapper for AddRef. +#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef::type + +// A helper for implementing get(). +template class Get; + +// A helper for implementing tuple_element. kIndexValid is true +// iff k < the number of fields in tuple type T. 
+template +struct TupleElement; + + +$for i [[ +template +struct TupleElement { + typedef T$i type; +}; + + +]] +} // namespace gtest_internal + +template <> +class tuple<> { + public: + tuple() {} + tuple(const tuple& /* t */) {} + tuple& operator=(const tuple& /* t */) { return *this; } +}; + + +$for k [[ +$range m 0..k-1 +template +class $if k < n [[GTEST_$(k)_TUPLE_(T)]] $else [[tuple]] { + public: + template friend class gtest_internal::Get; + + tuple() : $for m, [[f$(m)_()]] {} + + explicit tuple($for m, [[GTEST_BY_REF_(T$m) f$m]]) : [[]] +$for m, [[f$(m)_(f$m)]] {} + + tuple(const tuple& t) : $for m, [[f$(m)_(t.f$(m)_)]] {} + + template + tuple(const GTEST_$(k)_TUPLE_(U)& t) : $for m, [[f$(m)_(t.f$(m)_)]] {} + +$if k == 2 [[ + template + tuple(const ::std::pair& p) : f0_(p.first), f1_(p.second) {} + +]] + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_$(k)_TUPLE_(U)& t) { + return CopyFrom(t); + } + +$if k == 2 [[ + template + tuple& operator=(const ::std::pair& p) { + f0_ = p.first; + f1_ = p.second; + return *this; + } + +]] + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_$(k)_TUPLE_(U)& t) { + +$for m [[ + f$(m)_ = t.f$(m)_; + +]] + return *this; + } + + +$for m [[ + T$m f$(m)_; + +]] +}; + + +]] +// 6.1.3.2 Tuple creation functions. + +// Known limitations: we don't support passing an +// std::tr1::reference_wrapper to make_tuple(). And we don't +// implement tie(). + +inline tuple<> make_tuple() { return tuple<>(); } + +$for k [[ +$range m 0..k-1 + +template +inline GTEST_$(k)_TUPLE_(T) make_tuple($for m, [[const T$m& f$m]]) { + return GTEST_$(k)_TUPLE_(T)($for m, [[f$m]]); +} + +]] + +// 6.1.3.3 Tuple helper classes. + +template struct tuple_size; + + +$for j [[ +template +struct tuple_size { + static const int value = $j; +}; + + +]] +template +struct tuple_element { + typedef typename gtest_internal::TupleElement< + k < (tuple_size::value), k, Tuple>::type type; +}; + +#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element::type + +// 6.1.3.4 Element access. + +namespace gtest_internal { + + +$for i [[ +template <> +class Get<$i> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_($i, Tuple)) + Field(Tuple& t) { return t.f$(i)_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_($i, Tuple)) + ConstField(const Tuple& t) { return t.f$(i)_; } +}; + + +]] +} // namespace gtest_internal + +template +GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_$(n)_TUPLE_(T))) +get(GTEST_$(n)_TUPLE_(T)& t) { + return gtest_internal::Get::Field(t); +} + +template +GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_$(n)_TUPLE_(T))) +get(const GTEST_$(n)_TUPLE_(T)& t) { + return gtest_internal::Get::ConstField(t); +} + +// 6.1.3.5 Relational operators + +// We only implement == and !=, as we don't have a need for the rest yet. + +namespace gtest_internal { + +// SameSizeTuplePrefixComparator::Eq(t1, t2) returns true if the +// first k fields of t1 equals the first k fields of t2. +// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if +// k1 != k2. 
+template +struct SameSizeTuplePrefixComparator; + +template <> +struct SameSizeTuplePrefixComparator<0, 0> { + template + static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) { + return true; + } +}; + +template +struct SameSizeTuplePrefixComparator { + template + static bool Eq(const Tuple1& t1, const Tuple2& t2) { + return SameSizeTuplePrefixComparator::Eq(t1, t2) && + ::std::tr1::get(t1) == ::std::tr1::get(t2); + } +}; + +} // namespace gtest_internal + +template +inline bool operator==(const GTEST_$(n)_TUPLE_(T)& t, + const GTEST_$(n)_TUPLE_(U)& u) { + return gtest_internal::SameSizeTuplePrefixComparator< + tuple_size::value, + tuple_size::value>::Eq(t, u); +} + +template +inline bool operator!=(const GTEST_$(n)_TUPLE_(T)& t, + const GTEST_$(n)_TUPLE_(U)& u) { return !(t == u); } + +// 6.1.4 Pairs. +// Unimplemented. + +} // namespace tr1 +} // namespace std + + +$for j [[ +#undef GTEST_$(j)_TUPLE_ + +]] + + +$for j [[ +#undef GTEST_$(j)_TYPENAMES_ + +]] + +#undef GTEST_DECLARE_TUPLE_AS_FRIEND_ +#undef GTEST_BY_REF_ +#undef GTEST_ADD_REF_ +#undef GTEST_TUPLE_ELEMENT_ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h new file mode 100644 index 0000000000..e46f7cfcb4 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h @@ -0,0 +1,3331 @@ +// This file was GENERATED by command: +// pump.py gtest-type-util.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Type utilities needed for implementing typed and type-parameterized +// tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +// Currently we support at most 50 types in a list, and at most 50 +// type-parameterized tests in one type-parameterized test case. 
+// Please contact googletestframework@googlegroups.com if you need +// more. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ + +#include "gtest/internal/gtest-port.h" + +// #ifdef __GNUC__ is too general here. It is possible to use gcc without using +// libstdc++ (which is where cxxabi.h comes from). +# if GTEST_HAS_CXXABI_H_ +# include +# elif defined(__HP_aCC) +# include +# endif // GTEST_HASH_CXXABI_H_ + +namespace testing { +namespace internal { + +// GetTypeName() returns a human-readable name of type T. +// NB: This function is also used in Google Mock, so don't move it inside of +// the typed-test-only section below. +template +std::string GetTypeName() { +# if GTEST_HAS_RTTI + + const char* const name = typeid(T).name(); +# if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC) + int status = 0; + // gcc's implementation of typeid(T).name() mangles the type name, + // so we have to demangle it. +# if GTEST_HAS_CXXABI_H_ + using abi::__cxa_demangle; +# endif // GTEST_HAS_CXXABI_H_ + char* const readable_name = __cxa_demangle(name, 0, 0, &status); + const std::string name_str(status == 0 ? readable_name : name); + free(readable_name); + return name_str; +# else + return name; +# endif // GTEST_HAS_CXXABI_H_ || __HP_aCC + +# else + + return ""; + +# endif // GTEST_HAS_RTTI +} + +#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +// AssertyTypeEq::type is defined iff T1 and T2 are the same +// type. This can be used as a compile-time assertion to ensure that +// two types are equal. + +template +struct AssertTypeEq; + +template +struct AssertTypeEq { + typedef bool type; +}; + +// A unique type used as the default value for the arguments of class +// template Types. This allows us to simulate variadic templates +// (e.g. Types, Type, and etc), which C++ doesn't +// support directly. +struct None {}; + +// The following family of struct and struct templates are used to +// represent type lists. In particular, TypesN +// represents a type list with N types (T1, T2, ..., and TN) in it. +// Except for Types0, every struct in the family has two member types: +// Head for the first type in the list, and Tail for the rest of the +// list. + +// The empty type list. +struct Types0 {}; + +// Type lists of length 1, 2, 3, and so on. 
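// For instance (an illustrative decomposition):
//
//   Types3<int, char, double>::Head  is  int
//   Types3<int, char, double>::Tail  is  Types2<char, double>
//
// so a list can be traversed recursively, peeling off one Head per step.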
+ +template +struct Types1 { + typedef T1 Head; + typedef Types0 Tail; +}; +template +struct Types2 { + typedef T1 Head; + typedef Types1 Tail; +}; + +template +struct Types3 { + typedef T1 Head; + typedef Types2 Tail; +}; + +template +struct Types4 { + typedef T1 Head; + typedef Types3 Tail; +}; + +template +struct Types5 { + typedef T1 Head; + typedef Types4 Tail; +}; + +template +struct Types6 { + typedef T1 Head; + typedef Types5 Tail; +}; + +template +struct Types7 { + typedef T1 Head; + typedef Types6 Tail; +}; + +template +struct Types8 { + typedef T1 Head; + typedef Types7 Tail; +}; + +template +struct Types9 { + typedef T1 Head; + typedef Types8 Tail; +}; + +template +struct Types10 { + typedef T1 Head; + typedef Types9 Tail; +}; + +template +struct Types11 { + typedef T1 Head; + typedef Types10 Tail; +}; + +template +struct Types12 { + typedef T1 Head; + typedef Types11 Tail; +}; + +template +struct Types13 { + typedef T1 Head; + typedef Types12 Tail; +}; + +template +struct Types14 { + typedef T1 Head; + typedef Types13 Tail; +}; + +template +struct Types15 { + typedef T1 Head; + typedef Types14 Tail; +}; + +template +struct Types16 { + typedef T1 Head; + typedef Types15 Tail; +}; + +template +struct Types17 { + typedef T1 Head; + typedef Types16 Tail; +}; + +template +struct Types18 { + typedef T1 Head; + typedef Types17 Tail; +}; + +template +struct Types19 { + typedef T1 Head; + typedef Types18 Tail; +}; + +template +struct Types20 { + typedef T1 Head; + typedef Types19 Tail; +}; + +template +struct Types21 { + typedef T1 Head; + typedef Types20 Tail; +}; + +template +struct Types22 { + typedef T1 Head; + typedef Types21 Tail; +}; + +template +struct Types23 { + typedef T1 Head; + typedef Types22 Tail; +}; + +template +struct Types24 { + typedef T1 Head; + typedef Types23 Tail; +}; + +template +struct Types25 { + typedef T1 Head; + typedef Types24 Tail; +}; + +template +struct Types26 { + typedef T1 Head; + typedef Types25 Tail; +}; + +template +struct Types27 { + typedef T1 Head; + typedef Types26 Tail; +}; + +template +struct Types28 { + typedef T1 Head; + typedef Types27 Tail; +}; + +template +struct Types29 { + typedef T1 Head; + typedef Types28 Tail; +}; + +template +struct Types30 { + typedef T1 Head; + typedef Types29 Tail; +}; + +template +struct Types31 { + typedef T1 Head; + typedef Types30 Tail; +}; + +template +struct Types32 { + typedef T1 Head; + typedef Types31 Tail; +}; + +template +struct Types33 { + typedef T1 Head; + typedef Types32 Tail; +}; + +template +struct Types34 { + typedef T1 Head; + typedef Types33 Tail; +}; + +template +struct Types35 { + typedef T1 Head; + typedef Types34 Tail; +}; + +template +struct Types36 { + typedef T1 Head; + typedef Types35 Tail; +}; + +template +struct Types37 { + typedef T1 Head; + typedef Types36 Tail; +}; + +template +struct Types38 { + typedef T1 Head; + typedef Types37 Tail; +}; + +template +struct Types39 { + typedef T1 Head; + typedef Types38 Tail; +}; + +template +struct Types40 { + typedef T1 Head; + typedef Types39 Tail; +}; + +template +struct Types41 { + typedef T1 Head; + typedef Types40 Tail; +}; + +template +struct Types42 { + typedef T1 Head; + typedef Types41 Tail; +}; + +template +struct Types43 { + typedef T1 Head; + typedef Types42 Tail; +}; + +template +struct Types44 { + typedef T1 Head; + typedef Types43 Tail; +}; + +template +struct Types45 { + typedef T1 Head; + typedef Types44 Tail; +}; + +template +struct Types46 { + typedef T1 Head; + typedef Types45 Tail; +}; + +template +struct Types47 { + 
typedef T1 Head; + typedef Types46 Tail; +}; + +template +struct Types48 { + typedef T1 Head; + typedef Types47 Tail; +}; + +template +struct Types49 { + typedef T1 Head; + typedef Types48 Tail; +}; + +template +struct Types50 { + typedef T1 Head; + typedef Types49 Tail; +}; + + +} // namespace internal + +// We don't want to require the users to write TypesN<...> directly, +// as that would require them to count the length. Types<...> is much +// easier to write, but generates horrible messages when there is a +// compiler error, as gcc insists on printing out each template +// argument, even if it has the default value (this means Types +// will appear as Types in the compiler +// errors). +// +// Our solution is to combine the best part of the two approaches: a +// user would write Types, and Google Test will translate +// that to TypesN internally to make error messages +// readable. The translation is done by the 'type' member of the +// Types template. +template +struct Types { + typedef internal::Types50 type; +}; + +template <> +struct Types { + typedef internal::Types0 type; +}; +template +struct Types { + typedef internal::Types1 type; +}; +template +struct Types { + typedef internal::Types2 type; +}; +template +struct Types { + typedef internal::Types3 type; +}; +template +struct Types { + typedef internal::Types4 type; +}; +template +struct Types { + typedef internal::Types5 type; +}; +template +struct Types { + typedef internal::Types6 type; +}; +template +struct Types { + typedef internal::Types7 type; +}; +template +struct Types { + typedef internal::Types8 type; +}; +template +struct Types { + typedef internal::Types9 type; +}; +template +struct Types { + typedef internal::Types10 type; +}; +template +struct Types { + typedef internal::Types11 type; +}; +template +struct Types { + typedef internal::Types12 type; +}; +template +struct Types { + typedef internal::Types13 type; +}; +template +struct Types { + typedef internal::Types14 type; +}; +template +struct Types { + typedef internal::Types15 type; +}; +template +struct Types { + typedef internal::Types16 type; +}; +template +struct Types { + typedef internal::Types17 type; +}; +template +struct Types { + typedef internal::Types18 type; +}; +template +struct Types { + typedef internal::Types19 type; +}; +template +struct Types { + typedef internal::Types20 type; +}; +template +struct Types { + typedef internal::Types21 type; +}; +template +struct Types { + typedef internal::Types22 type; +}; +template +struct Types { + typedef internal::Types23 type; +}; +template +struct Types { + typedef internal::Types24 type; +}; +template +struct Types { + typedef internal::Types25 type; +}; +template +struct Types { + typedef internal::Types26 type; +}; +template +struct Types { + typedef internal::Types27 type; +}; +template +struct Types { + typedef internal::Types28 type; +}; +template +struct Types { + typedef internal::Types29 type; +}; +template +struct Types { + typedef internal::Types30 type; +}; +template +struct Types { + typedef internal::Types31 type; +}; +template +struct Types { + typedef internal::Types32 type; +}; +template +struct Types { + typedef internal::Types33 type; +}; +template +struct Types { + typedef internal::Types34 type; +}; +template +struct Types { + typedef internal::Types35 type; +}; +template +struct Types { + typedef internal::Types36 type; +}; +template +struct Types { + typedef internal::Types37 type; +}; +template +struct Types { + typedef internal::Types38 type; +}; +template +struct Types { 
+ typedef internal::Types39 type; +}; +template +struct Types { + typedef internal::Types40 type; +}; +template +struct Types { + typedef internal::Types41 type; +}; +template +struct Types { + typedef internal::Types42 type; +}; +template +struct Types { + typedef internal::Types43 type; +}; +template +struct Types { + typedef internal::Types44 type; +}; +template +struct Types { + typedef internal::Types45 type; +}; +template +struct Types { + typedef internal::Types46 type; +}; +template +struct Types { + typedef internal::Types47 type; +}; +template +struct Types { + typedef internal::Types48 type; +}; +template +struct Types { + typedef internal::Types49 type; +}; + +namespace internal { + +# define GTEST_TEMPLATE_ template class + +// The template "selector" struct TemplateSel is used to +// represent Tmpl, which must be a class template with one type +// parameter, as a type. TemplateSel::Bind::type is defined +// as the type Tmpl. This allows us to actually instantiate the +// template "selected" by TemplateSel. +// +// This trick is necessary for simulating typedef for class templates, +// which C++ doesn't support directly. +template +struct TemplateSel { + template + struct Bind { + typedef Tmpl type; + }; +}; + +# define GTEST_BIND_(TmplSel, T) \ + TmplSel::template Bind::type + +// A unique struct template used as the default value for the +// arguments of class template Templates. This allows us to simulate +// variadic templates (e.g. Templates, Templates, +// and etc), which C++ doesn't support directly. +template +struct NoneT {}; + +// The following family of struct and struct templates are used to +// represent template lists. In particular, TemplatesN represents a list of N templates (T1, T2, ..., and TN). Except +// for Templates0, every struct in the family has two member types: +// Head for the selector of the first template in the list, and Tail +// for the rest of the list. + +// The empty template list. +struct Templates0 {}; + +// Template lists of length 1, 2, 3, and so on. 
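The TemplateSel/GTEST_BIND_ pair defined above carries a class template around as an ordinary type so that it can sit in a type list. A rough sketch of the same trick with invented names (Box and Selector are not part of the patch):

// Illustrative only: a "selector" wraps a one-parameter class template as a
// type; Bind<T>::type instantiates it later, mirroring TemplateSel/GTEST_BIND_.
template <typename T>
struct Box { T value; };                        // some single-parameter class template

template <template <typename T> class Tmpl>
struct Selector {                               // analogous to TemplateSel
  template <typename T>
  struct Bind {
    typedef Tmpl<T> type;
  };
};

typedef Selector<Box> BoxSelector;              // the template, carried as a type
typedef BoxSelector::Bind<int>::type IntBox;    // same as Box<int>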
+ +template +struct Templates1 { + typedef TemplateSel Head; + typedef Templates0 Tail; +}; +template +struct Templates2 { + typedef TemplateSel Head; + typedef Templates1 Tail; +}; + +template +struct Templates3 { + typedef TemplateSel Head; + typedef Templates2 Tail; +}; + +template +struct Templates4 { + typedef TemplateSel Head; + typedef Templates3 Tail; +}; + +template +struct Templates5 { + typedef TemplateSel Head; + typedef Templates4 Tail; +}; + +template +struct Templates6 { + typedef TemplateSel Head; + typedef Templates5 Tail; +}; + +template +struct Templates7 { + typedef TemplateSel Head; + typedef Templates6 Tail; +}; + +template +struct Templates8 { + typedef TemplateSel Head; + typedef Templates7 Tail; +}; + +template +struct Templates9 { + typedef TemplateSel Head; + typedef Templates8 Tail; +}; + +template +struct Templates10 { + typedef TemplateSel Head; + typedef Templates9 Tail; +}; + +template +struct Templates11 { + typedef TemplateSel Head; + typedef Templates10 Tail; +}; + +template +struct Templates12 { + typedef TemplateSel Head; + typedef Templates11 Tail; +}; + +template +struct Templates13 { + typedef TemplateSel Head; + typedef Templates12 Tail; +}; + +template +struct Templates14 { + typedef TemplateSel Head; + typedef Templates13 Tail; +}; + +template +struct Templates15 { + typedef TemplateSel Head; + typedef Templates14 Tail; +}; + +template +struct Templates16 { + typedef TemplateSel Head; + typedef Templates15 Tail; +}; + +template +struct Templates17 { + typedef TemplateSel Head; + typedef Templates16 Tail; +}; + +template +struct Templates18 { + typedef TemplateSel Head; + typedef Templates17 Tail; +}; + +template +struct Templates19 { + typedef TemplateSel Head; + typedef Templates18 Tail; +}; + +template +struct Templates20 { + typedef TemplateSel Head; + typedef Templates19 Tail; +}; + +template +struct Templates21 { + typedef TemplateSel Head; + typedef Templates20 Tail; +}; + +template +struct Templates22 { + typedef TemplateSel Head; + typedef Templates21 Tail; +}; + +template +struct Templates23 { + typedef TemplateSel Head; + typedef Templates22 Tail; +}; + +template +struct Templates24 { + typedef TemplateSel Head; + typedef Templates23 Tail; +}; + +template +struct Templates25 { + typedef TemplateSel Head; + typedef Templates24 Tail; +}; + +template +struct Templates26 { + typedef TemplateSel Head; + typedef Templates25 Tail; +}; + +template +struct Templates27 { + typedef TemplateSel Head; + typedef Templates26 Tail; +}; + +template +struct Templates28 { + typedef TemplateSel Head; + typedef Templates27 Tail; +}; + +template +struct Templates29 { + typedef TemplateSel Head; + typedef Templates28 Tail; +}; + +template +struct Templates30 { + typedef TemplateSel Head; + typedef Templates29 Tail; +}; + +template +struct Templates31 { + typedef TemplateSel Head; + typedef Templates30 Tail; +}; + +template +struct Templates32 { + typedef TemplateSel Head; + typedef Templates31 Tail; +}; + +template +struct Templates33 { + typedef TemplateSel Head; + typedef Templates32 Tail; +}; + +template +struct Templates34 { + typedef TemplateSel Head; + typedef Templates33 Tail; +}; + +template +struct Templates35 { + typedef TemplateSel Head; + typedef Templates34 Tail; +}; + +template +struct Templates36 { + typedef TemplateSel Head; + typedef Templates35 Tail; +}; + +template +struct Templates37 { + typedef TemplateSel Head; + typedef Templates36 Tail; +}; + +template +struct Templates38 { + typedef TemplateSel Head; + typedef Templates37 Tail; +}; + 
+template +struct Templates39 { + typedef TemplateSel Head; + typedef Templates38 Tail; +}; + +template +struct Templates40 { + typedef TemplateSel Head; + typedef Templates39 Tail; +}; + +template +struct Templates41 { + typedef TemplateSel Head; + typedef Templates40 Tail; +}; + +template +struct Templates42 { + typedef TemplateSel Head; + typedef Templates41 Tail; +}; + +template +struct Templates43 { + typedef TemplateSel Head; + typedef Templates42 Tail; +}; + +template +struct Templates44 { + typedef TemplateSel Head; + typedef Templates43 Tail; +}; + +template +struct Templates45 { + typedef TemplateSel Head; + typedef Templates44 Tail; +}; + +template +struct Templates46 { + typedef TemplateSel Head; + typedef Templates45 Tail; +}; + +template +struct Templates47 { + typedef TemplateSel Head; + typedef Templates46 Tail; +}; + +template +struct Templates48 { + typedef TemplateSel Head; + typedef Templates47 Tail; +}; + +template +struct Templates49 { + typedef TemplateSel Head; + typedef Templates48 Tail; +}; + +template +struct Templates50 { + typedef TemplateSel Head; + typedef Templates49 Tail; +}; + + +// We don't want to require the users to write TemplatesN<...> directly, +// as that would require them to count the length. Templates<...> is much +// easier to write, but generates horrible messages when there is a +// compiler error, as gcc insists on printing out each template +// argument, even if it has the default value (this means Templates +// will appear as Templates in the compiler +// errors). +// +// Our solution is to combine the best part of the two approaches: a +// user would write Templates, and Google Test will translate +// that to TemplatesN internally to make error messages +// readable. The translation is done by the 'type' member of the +// Templates template. 
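The translation described above rests on defaulted template parameters plus partial specializations. A scaled-down sketch with two slots instead of fifty may make the mechanism easier to see; the names (List, Sentinel, and so on) are hypothetical and not from the patch.

// Illustrative only: mapping a user-facing list with defaulted slots onto an
// exact-length internal list, mirroring Templates<...> -> TemplatesN.
struct Sentinel {};                             // plays the role of NoneT/None

struct List0 {};
template <typename T1> struct List1 {};
template <typename T1, typename T2> struct List2 {};

template <typename T1 = Sentinel, typename T2 = Sentinel>
struct List {                                   // primary template: both slots used
  typedef List2<T1, T2> type;
};
template <>
struct List<Sentinel, Sentinel> {               // no slots used
  typedef List0 type;
};
template <typename T1>
struct List<T1, Sentinel> {                     // exactly one slot used
  typedef List1<T1> type;
};

// List<int>::type       is List1<int>
// List<int, char>::type is List2<int, char>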
+template +struct Templates { + typedef Templates50 type; +}; + +template <> +struct Templates { + typedef Templates0 type; +}; +template +struct Templates { + typedef Templates1 type; +}; +template +struct Templates { + typedef Templates2 type; +}; +template +struct Templates { + typedef Templates3 type; +}; +template +struct Templates { + typedef Templates4 type; +}; +template +struct Templates { + typedef Templates5 type; +}; +template +struct Templates { + typedef Templates6 type; +}; +template +struct Templates { + typedef Templates7 type; +}; +template +struct Templates { + typedef Templates8 type; +}; +template +struct Templates { + typedef Templates9 type; +}; +template +struct Templates { + typedef Templates10 type; +}; +template +struct Templates { + typedef Templates11 type; +}; +template +struct Templates { + typedef Templates12 type; +}; +template +struct Templates { + typedef Templates13 type; +}; +template +struct Templates { + typedef Templates14 type; +}; +template +struct Templates { + typedef Templates15 type; +}; +template +struct Templates { + typedef Templates16 type; +}; +template +struct Templates { + typedef Templates17 type; +}; +template +struct Templates { + typedef Templates18 type; +}; +template +struct Templates { + typedef Templates19 type; +}; +template +struct Templates { + typedef Templates20 type; +}; +template +struct Templates { + typedef Templates21 type; +}; +template +struct Templates { + typedef Templates22 type; +}; +template +struct Templates { + typedef Templates23 type; +}; +template +struct Templates { + typedef Templates24 type; +}; +template +struct Templates { + typedef Templates25 type; +}; +template +struct Templates { + typedef Templates26 type; +}; +template +struct Templates { + typedef Templates27 type; +}; +template +struct Templates { + typedef Templates28 type; +}; +template +struct Templates { + typedef Templates29 type; +}; +template +struct Templates { + typedef Templates30 type; +}; +template +struct Templates { + typedef Templates31 type; +}; +template +struct Templates { + typedef Templates32 type; +}; +template +struct Templates { + typedef Templates33 type; +}; +template +struct Templates { + typedef Templates34 type; +}; +template +struct Templates { + typedef Templates35 type; +}; +template +struct Templates { + typedef Templates36 type; +}; +template +struct Templates { + typedef Templates37 type; +}; +template +struct Templates { + typedef Templates38 type; +}; +template +struct Templates { + typedef Templates39 type; +}; +template +struct Templates { + typedef Templates40 type; +}; +template +struct Templates { + typedef Templates41 type; +}; +template +struct Templates { + typedef Templates42 type; +}; +template +struct Templates { + typedef Templates43 type; +}; +template +struct Templates { + typedef Templates44 type; +}; +template +struct Templates { + typedef Templates45 type; +}; +template +struct Templates { + typedef Templates46 type; +}; +template +struct Templates { + typedef Templates47 type; +}; +template +struct Templates { + typedef Templates48 type; +}; +template +struct Templates { + typedef Templates49 type; +}; + +// The TypeList template makes it possible to use either a single type +// or a Types<...> list in TYPED_TEST_CASE() and +// INSTANTIATE_TYPED_TEST_CASE_P(). 
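In user code all of this plumbing sits behind TYPED_TEST_CASE and friends. A typical use, shown for illustration only (FooTest and the test body are made up for the example):

// Illustrative only: standard typed-test usage. Types<int, long, double> is
// translated to an exact-length internal list by the machinery above.
#include "gtest/gtest.h"

template <typename T>
class FooTest : public ::testing::Test {};

typedef ::testing::Types<int, long, double> MyTypes;
TYPED_TEST_CASE(FooTest, MyTypes);

TYPED_TEST(FooTest, DefaultConstructedIsZero) {
  TypeParam value = TypeParam();
  EXPECT_EQ(TypeParam(0), value);
}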
+ +template +struct TypeList { + typedef Types1 type; +}; + +template +struct TypeList > { + typedef typename Types::type type; +}; + +#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump new file mode 100644 index 0000000000..251fdf025b --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump @@ -0,0 +1,297 @@ +$$ -*- mode: c++; -*- +$var n = 50 $$ Maximum length of type lists we want to support. +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Type utilities needed for implementing typed and type-parameterized +// tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +// Currently we support at most $n types in a list, and at most $n +// type-parameterized tests in one type-parameterized test case. +// Please contact googletestframework@googlegroups.com if you need +// more. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ + +#include "gtest/internal/gtest-port.h" + +// #ifdef __GNUC__ is too general here. It is possible to use gcc without using +// libstdc++ (which is where cxxabi.h comes from). +# if GTEST_HAS_CXXABI_H_ +# include +# elif defined(__HP_aCC) +# include +# endif // GTEST_HASH_CXXABI_H_ + +namespace testing { +namespace internal { + +// GetTypeName() returns a human-readable name of type T. +// NB: This function is also used in Google Mock, so don't move it inside of +// the typed-test-only section below. 
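GetTypeName() below leans on abi::__cxa_demangle when cxxabi.h is available. A standalone sketch of that call, with error handling reduced to the bare minimum and RTTI assumed to be enabled (not the Google Test implementation itself):

// Illustrative only: demangling typeid(T).name() with abi::__cxa_demangle,
// as GetTypeName() does when GTEST_HAS_CXXABI_H_ is set.
#include <cxxabi.h>
#include <cstdlib>
#include <string>
#include <typeinfo>

template <typename T>
std::string DemangledName() {
  int status = 0;
  // __cxa_demangle malloc()s the readable name; the caller must free() it.
  char* const readable = abi::__cxa_demangle(typeid(T).name(), 0, 0, &status);
  const std::string result(status == 0 ? readable : typeid(T).name());
  std::free(readable);
  return result;
}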
+template +std::string GetTypeName() { +# if GTEST_HAS_RTTI + + const char* const name = typeid(T).name(); +# if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC) + int status = 0; + // gcc's implementation of typeid(T).name() mangles the type name, + // so we have to demangle it. +# if GTEST_HAS_CXXABI_H_ + using abi::__cxa_demangle; +# endif // GTEST_HAS_CXXABI_H_ + char* const readable_name = __cxa_demangle(name, 0, 0, &status); + const std::string name_str(status == 0 ? readable_name : name); + free(readable_name); + return name_str; +# else + return name; +# endif // GTEST_HAS_CXXABI_H_ || __HP_aCC + +# else + + return ""; + +# endif // GTEST_HAS_RTTI +} + +#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +// AssertyTypeEq::type is defined iff T1 and T2 are the same +// type. This can be used as a compile-time assertion to ensure that +// two types are equal. + +template +struct AssertTypeEq; + +template +struct AssertTypeEq { + typedef bool type; +}; + +// A unique type used as the default value for the arguments of class +// template Types. This allows us to simulate variadic templates +// (e.g. Types, Type, and etc), which C++ doesn't +// support directly. +struct None {}; + +// The following family of struct and struct templates are used to +// represent type lists. In particular, TypesN +// represents a type list with N types (T1, T2, ..., and TN) in it. +// Except for Types0, every struct in the family has two member types: +// Head for the first type in the list, and Tail for the rest of the +// list. + +// The empty type list. +struct Types0 {}; + +// Type lists of length 1, 2, 3, and so on. + +template +struct Types1 { + typedef T1 Head; + typedef Types0 Tail; +}; + +$range i 2..n + +$for i [[ +$range j 1..i +$range k 2..i +template <$for j, [[typename T$j]]> +struct Types$i { + typedef T1 Head; + typedef Types$(i-1)<$for k, [[T$k]]> Tail; +}; + + +]] + +} // namespace internal + +// We don't want to require the users to write TypesN<...> directly, +// as that would require them to count the length. Types<...> is much +// easier to write, but generates horrible messages when there is a +// compiler error, as gcc insists on printing out each template +// argument, even if it has the default value (this means Types +// will appear as Types in the compiler +// errors). +// +// Our solution is to combine the best part of the two approaches: a +// user would write Types, and Google Test will translate +// that to TypesN internally to make error messages +// readable. The translation is done by the 'type' member of the +// Types template. + +$range i 1..n +template <$for i, [[typename T$i = internal::None]]> +struct Types { + typedef internal::Types$n<$for i, [[T$i]]> type; +}; + +template <> +struct Types<$for i, [[internal::None]]> { + typedef internal::Types0 type; +}; + +$range i 1..n-1 +$for i [[ +$range j 1..i +$range k i+1..n +template <$for j, [[typename T$j]]> +struct Types<$for j, [[T$j]]$for k[[, internal::None]]> { + typedef internal::Types$i<$for j, [[T$j]]> type; +}; + +]] + +namespace internal { + +# define GTEST_TEMPLATE_ template class + +// The template "selector" struct TemplateSel is used to +// represent Tmpl, which must be a class template with one type +// parameter, as a type. TemplateSel::Bind::type is defined +// as the type Tmpl. This allows us to actually instantiate the +// template "selected" by TemplateSel. +// +// This trick is necessary for simulating typedef for class templates, +// which C++ doesn't support directly. 
+template +struct TemplateSel { + template + struct Bind { + typedef Tmpl type; + }; +}; + +# define GTEST_BIND_(TmplSel, T) \ + TmplSel::template Bind::type + +// A unique struct template used as the default value for the +// arguments of class template Templates. This allows us to simulate +// variadic templates (e.g. Templates, Templates, +// and etc), which C++ doesn't support directly. +template +struct NoneT {}; + +// The following family of struct and struct templates are used to +// represent template lists. In particular, TemplatesN represents a list of N templates (T1, T2, ..., and TN). Except +// for Templates0, every struct in the family has two member types: +// Head for the selector of the first template in the list, and Tail +// for the rest of the list. + +// The empty template list. +struct Templates0 {}; + +// Template lists of length 1, 2, 3, and so on. + +template +struct Templates1 { + typedef TemplateSel Head; + typedef Templates0 Tail; +}; + +$range i 2..n + +$for i [[ +$range j 1..i +$range k 2..i +template <$for j, [[GTEST_TEMPLATE_ T$j]]> +struct Templates$i { + typedef TemplateSel Head; + typedef Templates$(i-1)<$for k, [[T$k]]> Tail; +}; + + +]] + +// We don't want to require the users to write TemplatesN<...> directly, +// as that would require them to count the length. Templates<...> is much +// easier to write, but generates horrible messages when there is a +// compiler error, as gcc insists on printing out each template +// argument, even if it has the default value (this means Templates +// will appear as Templates in the compiler +// errors). +// +// Our solution is to combine the best part of the two approaches: a +// user would write Templates, and Google Test will translate +// that to TemplatesN internally to make error messages +// readable. The translation is done by the 'type' member of the +// Templates template. + +$range i 1..n +template <$for i, [[GTEST_TEMPLATE_ T$i = NoneT]]> +struct Templates { + typedef Templates$n<$for i, [[T$i]]> type; +}; + +template <> +struct Templates<$for i, [[NoneT]]> { + typedef Templates0 type; +}; + +$range i 1..n-1 +$for i [[ +$range j 1..i +$range k i+1..n +template <$for j, [[GTEST_TEMPLATE_ T$j]]> +struct Templates<$for j, [[T$j]]$for k[[, NoneT]]> { + typedef Templates$i<$for j, [[T$j]]> type; +}; + +]] + +// The TypeList template makes it possible to use either a single type +// or a Types<...> list in TYPED_TEST_CASE() and +// INSTANTIATE_TYPED_TEST_CASE_P(). + +template +struct TypeList { + typedef Types1 type; +}; + + +$range i 1..n +template <$for i, [[typename T$i]]> +struct TypeList > { + typedef typename Types<$for i, [[T$i]]>::type type; +}; + +#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-all.cc b/libs/libvpx/third_party/googletest/src/src/gtest-all.cc index 912868148e..0a9cee5223 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest-all.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest-all.cc @@ -39,9554 +39,10 @@ #include "gtest/gtest.h" // The following lines pull in the real gtest *.cc files. -// Copyright 2005, Google Inc. -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) - -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// Utilities for testing Google Test itself and code that uses Google Test -// (e.g. frameworks built on top of Google Test). 
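The ScopedFakeTestPartResultReporter/TestPartResultArray pair that this header provides is what lets a test observe a Google Test failure instead of failing itself. A typical (illustrative) use:

// Illustrative only: intercepting a failure in the current thread and then
// asserting on what was captured.
#include "gtest/gtest.h"
#include "gtest/gtest-spi.h"

TEST(SpiDemo, InterceptsFailure) {
  ::testing::TestPartResultArray failures;
  {
    ::testing::ScopedFakeTestPartResultReporter reporter(
        ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ONLY_CURRENT_THREAD,
        &failures);
    ADD_FAILURE() << "captured, not reported";  // intercepted, test still passes
  }
  EXPECT_EQ(1, failures.size());
}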
- -#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_ -#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_ - - -namespace testing { - -// This helper class can be used to mock out Google Test failure reporting -// so that we can test Google Test or code that builds on Google Test. -// -// An object of this class appends a TestPartResult object to the -// TestPartResultArray object given in the constructor whenever a Google Test -// failure is reported. It can either intercept only failures that are -// generated in the same thread that created this object or it can intercept -// all generated failures. The scope of this mock object can be controlled with -// the second argument to the two arguments constructor. -class GTEST_API_ ScopedFakeTestPartResultReporter - : public TestPartResultReporterInterface { - public: - // The two possible mocking modes of this object. - enum InterceptMode { - INTERCEPT_ONLY_CURRENT_THREAD, // Intercepts only thread local failures. - INTERCEPT_ALL_THREADS // Intercepts all failures. - }; - - // The c'tor sets this object as the test part result reporter used - // by Google Test. The 'result' parameter specifies where to report the - // results. This reporter will only catch failures generated in the current - // thread. DEPRECATED - explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result); - - // Same as above, but you can choose the interception scope of this object. - ScopedFakeTestPartResultReporter(InterceptMode intercept_mode, - TestPartResultArray* result); - - // The d'tor restores the previous test part result reporter. - virtual ~ScopedFakeTestPartResultReporter(); - - // Appends the TestPartResult object to the TestPartResultArray - // received in the constructor. - // - // This method is from the TestPartResultReporterInterface - // interface. - virtual void ReportTestPartResult(const TestPartResult& result); - private: - void Init(); - - const InterceptMode intercept_mode_; - TestPartResultReporterInterface* old_reporter_; - TestPartResultArray* const result_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter); -}; - -namespace internal { - -// A helper class for implementing EXPECT_FATAL_FAILURE() and -// EXPECT_NONFATAL_FAILURE(). Its destructor verifies that the given -// TestPartResultArray contains exactly one failure that has the given -// type and contains the given substring. If that's not the case, a -// non-fatal failure will be generated. -class GTEST_API_ SingleFailureChecker { - public: - // The constructor remembers the arguments. - SingleFailureChecker(const TestPartResultArray* results, - TestPartResult::Type type, - const string& substr); - ~SingleFailureChecker(); - private: - const TestPartResultArray* const results_; - const TestPartResult::Type type_; - const string substr_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker); -}; - -} // namespace internal - -} // namespace testing - -// A set of macros for testing Google Test assertions or code that's expected -// to generate Google Test fatal failures. It verifies that the given -// statement will cause exactly one fatal Google Test failure with 'substr' -// being part of the failure message. -// -// There are two different versions of this macro. EXPECT_FATAL_FAILURE only -// affects and considers failures generated in the current thread and -// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. -// -// The verification of the assertion is done correctly even when the statement -// throws an exception or aborts the current function. 
-// -// Known restrictions: -// - 'statement' cannot reference local non-static variables or -// non-static members of the current object. -// - 'statement' cannot return a value. -// - You cannot stream a failure message to this macro. -// -// Note that even though the implementations of the following two -// macros are much alike, we cannot refactor them to use a common -// helper macro, due to some peculiarity in how the preprocessor -// works. The AcceptsMacroThatExpandsToUnprotectedComma test in -// gtest_unittest.cc will fail to compile if we do that. -#define EXPECT_FATAL_FAILURE(statement, substr) \ - do { \ - class GTestExpectFatalFailureHelper {\ - public:\ - static void Execute() { statement; }\ - };\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter:: \ - INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ - GTestExpectFatalFailureHelper::Execute();\ - }\ - } while (::testing::internal::AlwaysFalse()) - -#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ - do { \ - class GTestExpectFatalFailureHelper {\ - public:\ - static void Execute() { statement; }\ - };\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter:: \ - INTERCEPT_ALL_THREADS, >est_failures);\ - GTestExpectFatalFailureHelper::Execute();\ - }\ - } while (::testing::internal::AlwaysFalse()) - -// A macro for testing Google Test assertions or code that's expected to -// generate Google Test non-fatal failures. It asserts that the given -// statement will cause exactly one non-fatal Google Test failure with 'substr' -// being part of the failure message. -// -// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only -// affects and considers failures generated in the current thread and -// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. -// -// 'statement' is allowed to reference local variables and members of -// the current object. -// -// The verification of the assertion is done correctly even when the statement -// throws an exception or aborts the current function. -// -// Known restrictions: -// - You cannot stream a failure message to this macro. -// -// Note that even though the implementations of the following two -// macros are much alike, we cannot refactor them to use a common -// helper macro, due to some peculiarity in how the preprocessor -// works. If we do that, the code won't compile when the user gives -// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that -// expands to code containing an unprotected comma. The -// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc -// catches that. -// -// For the same reason, we have to write -// if (::testing::internal::AlwaysTrue()) { statement; } -// instead of -// GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) -// to avoid an MSVC warning on unreachable code. 
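Typical use of the two macro families defined here, shown for illustration; the second argument is a substring that must appear in the failure message.

// Illustrative only: checking that a statement produces exactly one fatal
// (respectively non-fatal) Google Test failure.
#include "gtest/gtest.h"
#include "gtest/gtest-spi.h"

TEST(SpiDemo, FatalAndNonFatalFailuresAreChecked) {
  EXPECT_FATAL_FAILURE(ASSERT_TRUE(false), "false");
  EXPECT_NONFATAL_FAILURE(ADD_FAILURE() << "oops", "oops");
}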
-#define EXPECT_NONFATAL_FAILURE(statement, substr) \ - do {\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ - (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter:: \ - INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ - if (::testing::internal::AlwaysTrue()) { statement; }\ - }\ - } while (::testing::internal::AlwaysFalse()) - -#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ - do {\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ - (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \ - >est_failures);\ - if (::testing::internal::AlwaysTrue()) { statement; }\ - }\ - } while (::testing::internal::AlwaysFalse()) - -#endif // GTEST_INCLUDE_GTEST_GTEST_SPI_H_ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include // NOLINT -#include -#include - -#if GTEST_OS_LINUX - -// TODO(kenton@google.com): Use autoconf to detect availability of -// gettimeofday(). -# define GTEST_HAS_GETTIMEOFDAY_ 1 - -# include // NOLINT -# include // NOLINT -# include // NOLINT -// Declares vsnprintf(). This header is not available on Windows. -# include // NOLINT -# include // NOLINT -# include // NOLINT -# include // NOLINT -# include - -#elif GTEST_OS_SYMBIAN -# define GTEST_HAS_GETTIMEOFDAY_ 1 -# include // NOLINT - -#elif GTEST_OS_ZOS -# define GTEST_HAS_GETTIMEOFDAY_ 1 -# include // NOLINT - -// On z/OS we additionally need strings.h for strcasecmp. -# include // NOLINT - -#elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE. - -# include // NOLINT - -#elif GTEST_OS_WINDOWS // We are on Windows proper. - -# include // NOLINT -# include // NOLINT -# include // NOLINT -# include // NOLINT - -# if GTEST_OS_WINDOWS_MINGW -// MinGW has gettimeofday() but not _ftime64(). -// TODO(kenton@google.com): Use autoconf to detect availability of -// gettimeofday(). -// TODO(kenton@google.com): There are other ways to get the time on -// Windows, like GetTickCount() or GetSystemTimeAsFileTime(). MinGW -// supports these. consider using them instead. -# define GTEST_HAS_GETTIMEOFDAY_ 1 -# include // NOLINT -# endif // GTEST_OS_WINDOWS_MINGW - -// cpplint thinks that the header is already included, so we want to -// silence it. -# include // NOLINT - -#else - -// Assume other platforms have gettimeofday(). -// TODO(kenton@google.com): Use autoconf to detect availability of -// gettimeofday(). -# define GTEST_HAS_GETTIMEOFDAY_ 1 - -// cpplint thinks that the header is already included, so we want to -// silence it. -# include // NOLINT -# include // NOLINT - -#endif // GTEST_OS_LINUX - -#if GTEST_HAS_EXCEPTIONS -# include -#endif - -#if GTEST_CAN_STREAM_RESULTS_ -# include // NOLINT -# include // NOLINT -#endif - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick is to -// prevent a user from accidentally including gtest-internal-inl.h in -// his code. -#define GTEST_IMPLEMENTATION_ 1 -// Copyright 2005, Google Inc. -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Utility functions and classes used by the Google C++ testing framework. -// -// Author: wan@google.com (Zhanyong Wan) -// -// This file contains purely Google Test's internal implementation. Please -// DO NOT #INCLUDE IT IN A USER PROGRAM. - -#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_ -#define GTEST_SRC_GTEST_INTERNAL_INL_H_ - -// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is -// part of Google Test's implementation; otherwise it's undefined. -#if !GTEST_IMPLEMENTATION_ -// A user is trying to include this from his code - just say no. -# error "gtest-internal-inl.h is part of Google Test's internal implementation." -# error "It must not be included except by Google Test itself." -#endif // GTEST_IMPLEMENTATION_ - -#ifndef _WIN32_WCE -# include -#endif // !_WIN32_WCE -#include -#include // For strtoll/_strtoul64/malloc/free. -#include // For memmove. - -#include -#include -#include - - -#if GTEST_CAN_STREAM_RESULTS_ -# include // NOLINT -# include // NOLINT -#endif - -#if GTEST_OS_WINDOWS -# include // NOLINT -#endif // GTEST_OS_WINDOWS - - -namespace testing { - -// Declares the flags. -// -// We don't want the users to modify this flag in the code, but want -// Google Test's own unit tests to be able to access it. Therefore we -// declare it here as opposed to in gtest.h. -GTEST_DECLARE_bool_(death_test_use_fork); - -namespace internal { - -// The value of GetTestTypeId() as seen from within the Google Test -// library. This is solely for testing GetTestTypeId(). -GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest; - -// Names of the flags (needed for parsing Google Test flags). 
-const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests"; -const char kBreakOnFailureFlag[] = "break_on_failure"; -const char kCatchExceptionsFlag[] = "catch_exceptions"; -const char kColorFlag[] = "color"; -const char kFilterFlag[] = "filter"; -const char kListTestsFlag[] = "list_tests"; -const char kOutputFlag[] = "output"; -const char kPrintTimeFlag[] = "print_time"; -const char kRandomSeedFlag[] = "random_seed"; -const char kRepeatFlag[] = "repeat"; -const char kShuffleFlag[] = "shuffle"; -const char kStackTraceDepthFlag[] = "stack_trace_depth"; -const char kStreamResultToFlag[] = "stream_result_to"; -const char kThrowOnFailureFlag[] = "throw_on_failure"; - -// A valid random seed must be in [1, kMaxRandomSeed]. -const int kMaxRandomSeed = 99999; - -// g_help_flag is true iff the --help flag or an equivalent form is -// specified on the command line. -GTEST_API_ extern bool g_help_flag; - -// Returns the current time in milliseconds. -GTEST_API_ TimeInMillis GetTimeInMillis(); - -// Returns true iff Google Test should use colors in the output. -GTEST_API_ bool ShouldUseColor(bool stdout_is_tty); - -// Formats the given time in milliseconds as seconds. -GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms); - -// Converts the given time in milliseconds to a date string in the ISO 8601 -// format, without the timezone information. N.B.: due to the use the -// non-reentrant localtime() function, this function is not thread safe. Do -// not use it in any code that can be called from multiple threads. -GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms); - -// Parses a string for an Int32 flag, in the form of "--flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -GTEST_API_ bool ParseInt32Flag( - const char* str, const char* flag, Int32* value); - -// Returns a random seed in range [1, kMaxRandomSeed] based on the -// given --gtest_random_seed flag value. -inline int GetRandomSeedFromFlag(Int32 random_seed_flag) { - const unsigned int raw_seed = (random_seed_flag == 0) ? - static_cast(GetTimeInMillis()) : - static_cast(random_seed_flag); - - // Normalizes the actual seed to range [1, kMaxRandomSeed] such that - // it's easy to type. - const int normalized_seed = - static_cast((raw_seed - 1U) % - static_cast(kMaxRandomSeed)) + 1; - return normalized_seed; -} - -// Returns the first valid random seed after 'seed'. The behavior is -// undefined if 'seed' is invalid. The seed after kMaxRandomSeed is -// considered to be 1. -inline int GetNextRandomSeed(int seed) { - GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed) - << "Invalid random seed " << seed << " - must be in [1, " - << kMaxRandomSeed << "]."; - const int next_seed = seed + 1; - return (next_seed > kMaxRandomSeed) ? 1 : next_seed; -} - -// This class saves the values of all Google Test flags in its c'tor, and -// restores them in its d'tor. -class GTestFlagSaver { - public: - // The c'tor. 
- GTestFlagSaver() { - also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests); - break_on_failure_ = GTEST_FLAG(break_on_failure); - catch_exceptions_ = GTEST_FLAG(catch_exceptions); - color_ = GTEST_FLAG(color); - death_test_style_ = GTEST_FLAG(death_test_style); - death_test_use_fork_ = GTEST_FLAG(death_test_use_fork); - filter_ = GTEST_FLAG(filter); - internal_run_death_test_ = GTEST_FLAG(internal_run_death_test); - list_tests_ = GTEST_FLAG(list_tests); - output_ = GTEST_FLAG(output); - print_time_ = GTEST_FLAG(print_time); - random_seed_ = GTEST_FLAG(random_seed); - repeat_ = GTEST_FLAG(repeat); - shuffle_ = GTEST_FLAG(shuffle); - stack_trace_depth_ = GTEST_FLAG(stack_trace_depth); - stream_result_to_ = GTEST_FLAG(stream_result_to); - throw_on_failure_ = GTEST_FLAG(throw_on_failure); - } - - // The d'tor is not virtual. DO NOT INHERIT FROM THIS CLASS. - ~GTestFlagSaver() { - GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_; - GTEST_FLAG(break_on_failure) = break_on_failure_; - GTEST_FLAG(catch_exceptions) = catch_exceptions_; - GTEST_FLAG(color) = color_; - GTEST_FLAG(death_test_style) = death_test_style_; - GTEST_FLAG(death_test_use_fork) = death_test_use_fork_; - GTEST_FLAG(filter) = filter_; - GTEST_FLAG(internal_run_death_test) = internal_run_death_test_; - GTEST_FLAG(list_tests) = list_tests_; - GTEST_FLAG(output) = output_; - GTEST_FLAG(print_time) = print_time_; - GTEST_FLAG(random_seed) = random_seed_; - GTEST_FLAG(repeat) = repeat_; - GTEST_FLAG(shuffle) = shuffle_; - GTEST_FLAG(stack_trace_depth) = stack_trace_depth_; - GTEST_FLAG(stream_result_to) = stream_result_to_; - GTEST_FLAG(throw_on_failure) = throw_on_failure_; - } - - private: - // Fields for saving the original values of flags. - bool also_run_disabled_tests_; - bool break_on_failure_; - bool catch_exceptions_; - std::string color_; - std::string death_test_style_; - bool death_test_use_fork_; - std::string filter_; - std::string internal_run_death_test_; - bool list_tests_; - std::string output_; - bool print_time_; - internal::Int32 random_seed_; - internal::Int32 repeat_; - bool shuffle_; - internal::Int32 stack_trace_depth_; - std::string stream_result_to_; - bool throw_on_failure_; -} GTEST_ATTRIBUTE_UNUSED_; - -// Converts a Unicode code point to a narrow string in UTF-8 encoding. -// code_point parameter is of type UInt32 because wchar_t may not be -// wide enough to contain a code point. -// If the code_point is not a valid Unicode code point -// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted -// to "(Invalid Unicode 0xXXXXXXXX)". -GTEST_API_ std::string CodePointToUtf8(UInt32 code_point); - -// Converts a wide string to a narrow string in UTF-8 encoding. -// The wide string is assumed to have the following encoding: -// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) -// UTF-32 if sizeof(wchar_t) == 4 (on Linux) -// Parameter str points to a null-terminated wide string. -// Parameter num_chars may additionally limit the number -// of wchar_t characters processed. -1 is used when the entire string -// should be processed. -// If the string contains code points that are not valid Unicode code points -// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output -// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding -// and contains invalid UTF-16 surrogate pairs, values in those pairs -// will be encoded as individual Unicode characters from Basic Normal Plane. 
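CodePointToUtf8 and WideStringToUtf8 above follow the standard UTF-8 encoding ranges. A minimal sketch of that encoding (not the Google Test implementation; handling of invalid code points above U+10FFFF is omitted here):

// Illustrative only: encoding a single Unicode code point as UTF-8.
#include <string>

std::string EncodeUtf8(unsigned int cp) {
  std::string out;
  if (cp <= 0x7F) {                       // 1 byte:  0xxxxxxx
    out += static_cast<char>(cp);
  } else if (cp <= 0x7FF) {               // 2 bytes: 110xxxxx 10xxxxxx
    out += static_cast<char>(0xC0 | (cp >> 6));
    out += static_cast<char>(0x80 | (cp & 0x3F));
  } else if (cp <= 0xFFFF) {              // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    out += static_cast<char>(0xE0 | (cp >> 12));
    out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
    out += static_cast<char>(0x80 | (cp & 0x3F));
  } else {                                // 4 bytes, up to U+10FFFF
    out += static_cast<char>(0xF0 | (cp >> 18));
    out += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
    out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
    out += static_cast<char>(0x80 | (cp & 0x3F));
  }
  return out;
}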
-GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars); - -// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file -// if the variable is present. If a file already exists at this location, this -// function will write over it. If the variable is present, but the file cannot -// be created, prints an error and exits. -void WriteToShardStatusFileIfNeeded(); - -// Checks whether sharding is enabled by examining the relevant -// environment variable values. If the variables are present, -// but inconsistent (e.g., shard_index >= total_shards), prints -// an error and exits. If in_subprocess_for_death_test, sharding is -// disabled because it must only be applied to the original test -// process. Otherwise, we could filter out death tests we intended to execute. -GTEST_API_ bool ShouldShard(const char* total_shards_str, - const char* shard_index_str, - bool in_subprocess_for_death_test); - -// Parses the environment variable var as an Int32. If it is unset, -// returns default_val. If it is not an Int32, prints an error and -// and aborts. -GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val); - -// Given the total number of shards, the shard index, and the test id, -// returns true iff the test should be run on this shard. The test id is -// some arbitrary but unique non-negative integer assigned to each test -// method. Assumes that 0 <= shard_index < total_shards. -GTEST_API_ bool ShouldRunTestOnShard( - int total_shards, int shard_index, int test_id); - -// STL container utilities. - -// Returns the number of elements in the given container that satisfy -// the given predicate. -template -inline int CountIf(const Container& c, Predicate predicate) { - // Implemented as an explicit loop since std::count_if() in libCstd on - // Solaris has a non-standard signature. - int count = 0; - for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) { - if (predicate(*it)) - ++count; - } - return count; -} - -// Applies a function/functor to each element in the container. -template -void ForEach(const Container& c, Functor functor) { - std::for_each(c.begin(), c.end(), functor); -} - -// Returns the i-th element of the vector, or default_value if i is not -// in range [0, v.size()). -template -inline E GetElementOr(const std::vector& v, int i, E default_value) { - return (i < 0 || i >= static_cast(v.size())) ? default_value : v[i]; -} - -// Performs an in-place shuffle of a range of the vector's elements. -// 'begin' and 'end' are element indices as an STL-style range; -// i.e. [begin, end) are shuffled, where 'end' == size() means to -// shuffle to the end of the vector. -template -void ShuffleRange(internal::Random* random, int begin, int end, - std::vector* v) { - const int size = static_cast(v->size()); - GTEST_CHECK_(0 <= begin && begin <= size) - << "Invalid shuffle range start " << begin << ": must be in range [0, " - << size << "]."; - GTEST_CHECK_(begin <= end && end <= size) - << "Invalid shuffle range finish " << end << ": must be in range [" - << begin << ", " << size << "]."; - - // Fisher-Yates shuffle, from - // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle - for (int range_width = end - begin; range_width >= 2; range_width--) { - const int last_in_range = begin + range_width - 1; - const int selected = begin + random->Generate(range_width); - std::swap((*v)[selected], (*v)[last_in_range]); - } -} - -// Performs an in-place shuffle of the vector's elements. 
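The sharding helpers declared earlier in this header (ShouldShard, ShouldRunTestOnShard) promise only that each test id runs on exactly one shard, given 0 <= shard_index < total_shards. One straightforward policy satisfying that contract is a simple modulo; this sketch is illustrative and not necessarily the exact implementation.

// Illustrative only: a modulo-based sharding policy.
bool RunOnThisShard(int total_shards, int shard_index, int test_id) {
  return (test_id % total_shards) == shard_index;
}
// Example: with GTEST_TOTAL_SHARDS=3 and GTEST_SHARD_INDEX=1, the tests with
// ids 1, 4, 7, ... are run by this shard.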
-template -inline void Shuffle(internal::Random* random, std::vector* v) { - ShuffleRange(random, 0, static_cast(v->size()), v); -} - -// A function for deleting an object. Handy for being used as a -// functor. -template -static void Delete(T* x) { - delete x; -} - -// A predicate that checks the key of a TestProperty against a known key. -// -// TestPropertyKeyIs is copyable. -class TestPropertyKeyIs { - public: - // Constructor. - // - // TestPropertyKeyIs has NO default constructor. - explicit TestPropertyKeyIs(const std::string& key) : key_(key) {} - - // Returns true iff the test name of test property matches on key_. - bool operator()(const TestProperty& test_property) const { - return test_property.key() == key_; - } - - private: - std::string key_; -}; - -// Class UnitTestOptions. -// -// This class contains functions for processing options the user -// specifies when running the tests. It has only static members. -// -// In most cases, the user can specify an option using either an -// environment variable or a command line flag. E.g. you can set the -// test filter using either GTEST_FILTER or --gtest_filter. If both -// the variable and the flag are present, the latter overrides the -// former. -class GTEST_API_ UnitTestOptions { - public: - // Functions for processing the gtest_output flag. - - // Returns the output format, or "" for normal printed output. - static std::string GetOutputFormat(); - - // Returns the absolute path of the requested output file, or the - // default (test_detail.xml in the original working directory) if - // none was explicitly specified. - static std::string GetAbsolutePathToOutputFile(); - - // Functions for processing the gtest_filter flag. - - // Returns true iff the wildcard pattern matches the string. The - // first ':' or '\0' character in pattern marks the end of it. - // - // This recursive algorithm isn't very efficient, but is clear and - // works well enough for matching test names, which are short. - static bool PatternMatchesString(const char *pattern, const char *str); - - // Returns true iff the user-specified filter matches the test case - // name and the test name. - static bool FilterMatchesTest(const std::string &test_case_name, - const std::string &test_name); - -#if GTEST_OS_WINDOWS - // Function for supporting the gtest_catch_exception flag. - - // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the - // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. - // This function is useful as an __except condition. - static int GTestShouldProcessSEH(DWORD exception_code); -#endif // GTEST_OS_WINDOWS - - // Returns true if "name" matches the ':' separated list of glob-style - // filters in "filter". - static bool MatchesFilter(const std::string& name, const char* filter); -}; - -// Returns the current application's name, removing directory path if that -// is present. Used by UnitTestOptions::GetOutputFile. -GTEST_API_ FilePath GetCurrentExecutableName(); - -// The role interface for getting the OS stack trace as a string. -class OsStackTraceGetterInterface { - public: - OsStackTraceGetterInterface() {} - virtual ~OsStackTraceGetterInterface() {} - - // Returns the current OS stack trace as an std::string. Parameters: - // - // max_depth - the maximum number of stack frames to be included - // in the trace. - // skip_count - the number of top frames to be skipped; doesn't count - // against max_depth. 
- virtual string CurrentStackTrace(int max_depth, int skip_count) = 0; - - // UponLeavingGTest() should be called immediately before Google Test calls - // user code. It saves some information about the current stack that - // CurrentStackTrace() will use to find and hide Google Test stack frames. - virtual void UponLeavingGTest() = 0; - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface); -}; - -// A working implementation of the OsStackTraceGetterInterface interface. -class OsStackTraceGetter : public OsStackTraceGetterInterface { - public: - OsStackTraceGetter() : caller_frame_(NULL) {} - - virtual string CurrentStackTrace(int max_depth, int skip_count) - GTEST_LOCK_EXCLUDED_(mutex_); - - virtual void UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_); - - // This string is inserted in place of stack frames that are part of - // Google Test's implementation. - static const char* const kElidedFramesMarker; - - private: - Mutex mutex_; // protects all internal state - - // We save the stack frame below the frame that calls user code. - // We do this because the address of the frame immediately below - // the user code changes between the call to UponLeavingGTest() - // and any calls to CurrentStackTrace() from within the user code. - void* caller_frame_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter); -}; - -// Information about a Google Test trace point. -struct TraceInfo { - const char* file; - int line; - std::string message; -}; - -// This is the default global test part result reporter used in UnitTestImpl. -// This class should only be used by UnitTestImpl. -class DefaultGlobalTestPartResultReporter - : public TestPartResultReporterInterface { - public: - explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test); - // Implements the TestPartResultReporterInterface. Reports the test part - // result in the current test. - virtual void ReportTestPartResult(const TestPartResult& result); - - private: - UnitTestImpl* const unit_test_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter); -}; - -// This is the default per thread test part result reporter used in -// UnitTestImpl. This class should only be used by UnitTestImpl. -class DefaultPerThreadTestPartResultReporter - : public TestPartResultReporterInterface { - public: - explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test); - // Implements the TestPartResultReporterInterface. The implementation just - // delegates to the current global test part result reporter of *unit_test_. - virtual void ReportTestPartResult(const TestPartResult& result); - - private: - UnitTestImpl* const unit_test_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter); -}; - -// The private implementation of the UnitTest class. We don't protect -// the methods under a mutex, as this class is not accessible by a -// user and the UnitTest class that delegates work to this class does -// proper locking. -class GTEST_API_ UnitTestImpl { - public: - explicit UnitTestImpl(UnitTest* parent); - virtual ~UnitTestImpl(); - - // There are two different ways to register your own TestPartResultReporter. - // You can register your own repoter to listen either only for test results - // from the current thread or for results from all threads. - // By default, each per-thread test result repoter just passes a new - // TestPartResult to the global test result reporter, which registers the - // test part result for the currently running test. 
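For orientation while reviewing the reporter plumbing being removed here: the supported, public way for user code to observe test part results is the TestEventListeners API, not these internal reporter interfaces. A minimal sketch under that assumption (the FailureLogger class, its output format, and the standalone main() are hypothetical, illustrative only, and not part of this patch):

#include <stdio.h>

#include "gtest/gtest.h"

// Hypothetical listener: logs every failed assertion as it is reported,
// using only the public TestEventListener interface.
class FailureLogger : public ::testing::EmptyTestEventListener {
  virtual void OnTestPartResult(const ::testing::TestPartResult& result) {
    if (result.failed()) {
      printf("Failure at %s:%d\n%s\n",
             result.file_name() != NULL ? result.file_name() : "unknown file",
             result.line_number(), result.summary());
    }
  }
};

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  // The listener list takes ownership of appended listeners.
  ::testing::UnitTest::GetInstance()->listeners().Append(new FailureLogger);
  return RUN_ALL_TESTS();
}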
- - // Returns the global test part result reporter. - TestPartResultReporterInterface* GetGlobalTestPartResultReporter(); - - // Sets the global test part result reporter. - void SetGlobalTestPartResultReporter( - TestPartResultReporterInterface* reporter); - - // Returns the test part result reporter for the current thread. - TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread(); - - // Sets the test part result reporter for the current thread. - void SetTestPartResultReporterForCurrentThread( - TestPartResultReporterInterface* reporter); - - // Gets the number of successful test cases. - int successful_test_case_count() const; - - // Gets the number of failed test cases. - int failed_test_case_count() const; - - // Gets the number of all test cases. - int total_test_case_count() const; - - // Gets the number of all test cases that contain at least one test - // that should run. - int test_case_to_run_count() const; - - // Gets the number of successful tests. - int successful_test_count() const; - - // Gets the number of failed tests. - int failed_test_count() const; - - // Gets the number of disabled tests that will be reported in the XML report. - int reportable_disabled_test_count() const; - - // Gets the number of disabled tests. - int disabled_test_count() const; - - // Gets the number of tests to be printed in the XML report. - int reportable_test_count() const; - - // Gets the number of all tests. - int total_test_count() const; - - // Gets the number of tests that should run. - int test_to_run_count() const; - - // Gets the time of the test program start, in ms from the start of the - // UNIX epoch. - TimeInMillis start_timestamp() const { return start_timestamp_; } - - // Gets the elapsed time, in milliseconds. - TimeInMillis elapsed_time() const { return elapsed_time_; } - - // Returns true iff the unit test passed (i.e. all test cases passed). - bool Passed() const { return !Failed(); } - - // Returns true iff the unit test failed (i.e. some test case failed - // or something outside of all tests failed). - bool Failed() const { - return failed_test_case_count() > 0 || ad_hoc_test_result()->Failed(); - } - - // Gets the i-th test case among all the test cases. i can range from 0 to - // total_test_case_count() - 1. If i is not in that range, returns NULL. - const TestCase* GetTestCase(int i) const { - const int index = GetElementOr(test_case_indices_, i, -1); - return index < 0 ? NULL : test_cases_[i]; - } - - // Gets the i-th test case among all the test cases. i can range from 0 to - // total_test_case_count() - 1. If i is not in that range, returns NULL. - TestCase* GetMutableTestCase(int i) { - const int index = GetElementOr(test_case_indices_, i, -1); - return index < 0 ? NULL : test_cases_[index]; - } - - // Provides access to the event listener list. - TestEventListeners* listeners() { return &listeners_; } - - // Returns the TestResult for the test that's currently running, or - // the TestResult for the ad hoc test if no test is running. - TestResult* current_test_result(); - - // Returns the TestResult for the ad hoc test. - const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; } - - // Sets the OS stack trace getter. - // - // Does nothing if the input and the current OS stack trace getter - // are the same; otherwise, deletes the old getter and makes the - // input the current getter. 
- void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter); - - // Returns the current OS stack trace getter if it is not NULL; - // otherwise, creates an OsStackTraceGetter, makes it the current - // getter, and returns it. - OsStackTraceGetterInterface* os_stack_trace_getter(); - - // Returns the current OS stack trace as an std::string. - // - // The maximum number of stack frames to be included is specified by - // the gtest_stack_trace_depth flag. The skip_count parameter - // specifies the number of top frames to be skipped, which doesn't - // count against the number of frames to be included. - // - // For example, if Foo() calls Bar(), which in turn calls - // CurrentOsStackTraceExceptTop(1), Foo() will be included in the - // trace but Bar() and CurrentOsStackTraceExceptTop() won't. - std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_; - - // Finds and returns a TestCase with the given name. If one doesn't - // exist, creates one and returns it. - // - // Arguments: - // - // test_case_name: name of the test case - // type_param: the name of the test's type parameter, or NULL if - // this is not a typed or a type-parameterized test. - // set_up_tc: pointer to the function that sets up the test case - // tear_down_tc: pointer to the function that tears down the test case - TestCase* GetTestCase(const char* test_case_name, - const char* type_param, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc); - - // Adds a TestInfo to the unit test. - // - // Arguments: - // - // set_up_tc: pointer to the function that sets up the test case - // tear_down_tc: pointer to the function that tears down the test case - // test_info: the TestInfo object - void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc, - TestInfo* test_info) { - // In order to support thread-safe death tests, we need to - // remember the original working directory when the test program - // was first invoked. We cannot do this in RUN_ALL_TESTS(), as - // the user may have changed the current directory before calling - // RUN_ALL_TESTS(). Therefore we capture the current directory in - // AddTestInfo(), which is called to register a TEST or TEST_F - // before main() is reached. - if (original_working_dir_.IsEmpty()) { - original_working_dir_.Set(FilePath::GetCurrentDir()); - GTEST_CHECK_(!original_working_dir_.IsEmpty()) - << "Failed to get the current working directory."; - } - - GetTestCase(test_info->test_case_name(), - test_info->type_param(), - set_up_tc, - tear_down_tc)->AddTestInfo(test_info); - } - -#if GTEST_HAS_PARAM_TEST - // Returns ParameterizedTestCaseRegistry object used to keep track of - // value-parameterized tests and instantiate and register them. - internal::ParameterizedTestCaseRegistry& parameterized_test_registry() { - return parameterized_test_registry_; - } -#endif // GTEST_HAS_PARAM_TEST - - // Sets the TestCase object for the test that's currently running. - void set_current_test_case(TestCase* a_current_test_case) { - current_test_case_ = a_current_test_case; - } - - // Sets the TestInfo object for the test that's currently running. If - // current_test_info is NULL, the assertion results will be stored in - // ad_hoc_test_result_. 
- void set_current_test_info(TestInfo* a_current_test_info) { - current_test_info_ = a_current_test_info; - } - - // Registers all parameterized tests defined using TEST_P and - // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter - // combination. This method can be called more then once; it has guards - // protecting from registering the tests more then once. If - // value-parameterized tests are disabled, RegisterParameterizedTests is - // present but does nothing. - void RegisterParameterizedTests(); - - // Runs all tests in this UnitTest object, prints the result, and - // returns true if all tests are successful. If any exception is - // thrown during a test, this test is considered to be failed, but - // the rest of the tests will still be run. - bool RunAllTests(); - - // Clears the results of all tests, except the ad hoc tests. - void ClearNonAdHocTestResult() { - ForEach(test_cases_, TestCase::ClearTestCaseResult); - } - - // Clears the results of ad-hoc test assertions. - void ClearAdHocTestResult() { - ad_hoc_test_result_.Clear(); - } - - // Adds a TestProperty to the current TestResult object when invoked in a - // context of a test or a test case, or to the global property set. If the - // result already contains a property with the same key, the value will be - // updated. - void RecordProperty(const TestProperty& test_property); - - enum ReactionToSharding { - HONOR_SHARDING_PROTOCOL, - IGNORE_SHARDING_PROTOCOL - }; - - // Matches the full name of each test against the user-specified - // filter to decide whether the test should run, then records the - // result in each TestCase and TestInfo object. - // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests - // based on sharding variables in the environment. - // Returns the number of tests that should run. - int FilterTests(ReactionToSharding shard_tests); - - // Prints the names of the tests matching the user-specified filter flag. - void ListTestsMatchingFilter(); - - const TestCase* current_test_case() const { return current_test_case_; } - TestInfo* current_test_info() { return current_test_info_; } - const TestInfo* current_test_info() const { return current_test_info_; } - - // Returns the vector of environments that need to be set-up/torn-down - // before/after the tests are run. - std::vector& environments() { return environments_; } - - // Getters for the per-thread Google Test trace stack. - std::vector& gtest_trace_stack() { - return *(gtest_trace_stack_.pointer()); - } - const std::vector& gtest_trace_stack() const { - return gtest_trace_stack_.get(); - } - -#if GTEST_HAS_DEATH_TEST - void InitDeathTestSubprocessControlInfo() { - internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag()); - } - // Returns a pointer to the parsed --gtest_internal_run_death_test - // flag, or NULL if that flag was not specified. - // This information is useful only in a death test child process. - // Must not be called before a call to InitGoogleTest. - const InternalRunDeathTestFlag* internal_run_death_test_flag() const { - return internal_run_death_test_flag_.get(); - } - - // Returns a pointer to the current death test factory. - internal::DeathTestFactory* death_test_factory() { - return death_test_factory_.get(); - } - - void SuppressTestEventsIfInSubprocess(); - - friend class ReplaceDeathTestFactory; -#endif // GTEST_HAS_DEATH_TEST - - // Initializes the event listener performing XML output as specified by - // UnitTestOptions. Must not be called before InitGoogleTest. 
- void ConfigureXmlOutput(); - -#if GTEST_CAN_STREAM_RESULTS_ - // Initializes the event listener for streaming test results to a socket. - // Must not be called before InitGoogleTest. - void ConfigureStreamingOutput(); -#endif - - // Performs initialization dependent upon flag values obtained in - // ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to - // ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest - // this function is also called from RunAllTests. Since this function can be - // called more than once, it has to be idempotent. - void PostFlagParsingInit(); - - // Gets the random seed used at the start of the current test iteration. - int random_seed() const { return random_seed_; } - - // Gets the random number generator. - internal::Random* random() { return &random_; } - - // Shuffles all test cases, and the tests within each test case, - // making sure that death tests are still run first. - void ShuffleTests(); - - // Restores the test cases and tests to their order before the first shuffle. - void UnshuffleTests(); - - // Returns the value of GTEST_FLAG(catch_exceptions) at the moment - // UnitTest::Run() starts. - bool catch_exceptions() const { return catch_exceptions_; } - - private: - friend class ::testing::UnitTest; - - // Used by UnitTest::Run() to capture the state of - // GTEST_FLAG(catch_exceptions) at the moment it starts. - void set_catch_exceptions(bool value) { catch_exceptions_ = value; } - - // The UnitTest object that owns this implementation object. - UnitTest* const parent_; - - // The working directory when the first TEST() or TEST_F() was - // executed. - internal::FilePath original_working_dir_; - - // The default test part result reporters. - DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_; - DefaultPerThreadTestPartResultReporter - default_per_thread_test_part_result_reporter_; - - // Points to (but doesn't own) the global test part result reporter. - TestPartResultReporterInterface* global_test_part_result_repoter_; - - // Protects read and write access to global_test_part_result_reporter_. - internal::Mutex global_test_part_result_reporter_mutex_; - - // Points to (but doesn't own) the per-thread test part result reporter. - internal::ThreadLocal - per_thread_test_part_result_reporter_; - - // The vector of environments that need to be set-up/torn-down - // before/after the tests are run. - std::vector environments_; - - // The vector of TestCases in their original order. It owns the - // elements in the vector. - std::vector test_cases_; - - // Provides a level of indirection for the test case list to allow - // easy shuffling and restoring the test case order. The i-th - // element of this vector is the index of the i-th test case in the - // shuffled order. - std::vector test_case_indices_; - -#if GTEST_HAS_PARAM_TEST - // ParameterizedTestRegistry object used to register value-parameterized - // tests. - internal::ParameterizedTestCaseRegistry parameterized_test_registry_; - - // Indicates whether RegisterParameterizedTests() has been called already. - bool parameterized_tests_registered_; -#endif // GTEST_HAS_PARAM_TEST - - // Index of the last death test case registered. Initially -1. - int last_death_test_case_; - - // This points to the TestCase for the currently running test. It - // changes as Google Test goes through one test case after another. - // When no test is running, this is set to NULL and Google Test - // stores assertion results in ad_hoc_test_result_. 
Initially NULL. - TestCase* current_test_case_; - - // This points to the TestInfo for the currently running test. It - // changes as Google Test goes through one test after another. When - // no test is running, this is set to NULL and Google Test stores - // assertion results in ad_hoc_test_result_. Initially NULL. - TestInfo* current_test_info_; - - // Normally, a user only writes assertions inside a TEST or TEST_F, - // or inside a function called by a TEST or TEST_F. Since Google - // Test keeps track of which test is current running, it can - // associate such an assertion with the test it belongs to. - // - // If an assertion is encountered when no TEST or TEST_F is running, - // Google Test attributes the assertion result to an imaginary "ad hoc" - // test, and records the result in ad_hoc_test_result_. - TestResult ad_hoc_test_result_; - - // The list of event listeners that can be used to track events inside - // Google Test. - TestEventListeners listeners_; - - // The OS stack trace getter. Will be deleted when the UnitTest - // object is destructed. By default, an OsStackTraceGetter is used, - // but the user can set this field to use a custom getter if that is - // desired. - OsStackTraceGetterInterface* os_stack_trace_getter_; - - // True iff PostFlagParsingInit() has been called. - bool post_flag_parse_init_performed_; - - // The random number seed used at the beginning of the test run. - int random_seed_; - - // Our random number generator. - internal::Random random_; - - // The time of the test program start, in ms from the start of the - // UNIX epoch. - TimeInMillis start_timestamp_; - - // How long the test took to run, in milliseconds. - TimeInMillis elapsed_time_; - -#if GTEST_HAS_DEATH_TEST - // The decomposed components of the gtest_internal_run_death_test flag, - // parsed when RUN_ALL_TESTS is called. - internal::scoped_ptr internal_run_death_test_flag_; - internal::scoped_ptr death_test_factory_; -#endif // GTEST_HAS_DEATH_TEST - - // A per-thread stack of traces created by the SCOPED_TRACE() macro. - internal::ThreadLocal > gtest_trace_stack_; - - // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests() - // starts. - bool catch_exceptions_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl); -}; // class UnitTestImpl - -// Convenience function for accessing the global UnitTest -// implementation object. -inline UnitTestImpl* GetUnitTestImpl() { - return UnitTest::GetInstance()->impl(); -} - -#if GTEST_USES_SIMPLE_RE - -// Internal helper functions for implementing the simple regular -// expression matcher. -GTEST_API_ bool IsInSet(char ch, const char* str); -GTEST_API_ bool IsAsciiDigit(char ch); -GTEST_API_ bool IsAsciiPunct(char ch); -GTEST_API_ bool IsRepeat(char ch); -GTEST_API_ bool IsAsciiWhiteSpace(char ch); -GTEST_API_ bool IsAsciiWordChar(char ch); -GTEST_API_ bool IsValidEscape(char ch); -GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch); -GTEST_API_ bool ValidateRegex(const char* regex); -GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str); -GTEST_API_ bool MatchRepetitionAndRegexAtHead( - bool escaped, char ch, char repeat, const char* regex, const char* str); -GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str); - -#endif // GTEST_USES_SIMPLE_RE - -// Parses the command line for Google Test flags, without initializing -// other parts of Google Test. 
-GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv); -GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv); - -#if GTEST_HAS_DEATH_TEST - -// Returns the message describing the last system error, regardless of the -// platform. -GTEST_API_ std::string GetLastErrnoDescription(); - -# if GTEST_OS_WINDOWS -// Provides leak-safe Windows kernel handle ownership. -class AutoHandle { - public: - AutoHandle() : handle_(INVALID_HANDLE_VALUE) {} - explicit AutoHandle(HANDLE handle) : handle_(handle) {} - - ~AutoHandle() { Reset(); } - - HANDLE Get() const { return handle_; } - void Reset() { Reset(INVALID_HANDLE_VALUE); } - void Reset(HANDLE handle) { - if (handle != handle_) { - if (handle_ != INVALID_HANDLE_VALUE) - ::CloseHandle(handle_); - handle_ = handle; - } - } - - private: - HANDLE handle_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle); -}; -# endif // GTEST_OS_WINDOWS - -// Attempts to parse a string into a positive integer pointed to by the -// number parameter. Returns true if that is possible. -// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use -// it here. -template -bool ParseNaturalNumber(const ::std::string& str, Integer* number) { - // Fail fast if the given string does not begin with a digit; - // this bypasses strtoXXX's "optional leading whitespace and plus - // or minus sign" semantics, which are undesirable here. - if (str.empty() || !IsDigit(str[0])) { - return false; - } - errno = 0; - - char* end; - // BiggestConvertible is the largest integer type that system-provided - // string-to-number conversion routines can return. - -# if GTEST_OS_WINDOWS && !defined(__GNUC__) - - // MSVC and C++ Builder define __int64 instead of the standard long long. - typedef unsigned __int64 BiggestConvertible; - const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10); - -# else - - typedef unsigned long long BiggestConvertible; // NOLINT - const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10); - -# endif // GTEST_OS_WINDOWS && !defined(__GNUC__) - - const bool parse_success = *end == '\0' && errno == 0; - - // TODO(vladl@google.com): Convert this to compile time assertion when it is - // available. - GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed)); - - const Integer result = static_cast(parsed); - if (parse_success && static_cast(result) == parsed) { - *number = result; - return true; - } - return false; -} -#endif // GTEST_HAS_DEATH_TEST - -// TestResult contains some private methods that should be hidden from -// Google Test user but are required for testing. This class allow our tests -// to access them. -// -// This class is supplied only for the purpose of testing Google Test's own -// constructs. Do not use it in user tests, either directly or indirectly. -class TestResultAccessor { - public: - static void RecordProperty(TestResult* test_result, - const std::string& xml_element, - const TestProperty& property) { - test_result->RecordProperty(xml_element, property); - } - - static void ClearTestPartResults(TestResult* test_result) { - test_result->ClearTestPartResults(); - } - - static const std::vector& test_part_results( - const TestResult& test_result) { - return test_result.test_part_results(); - } -}; - -#if GTEST_CAN_STREAM_RESULTS_ - -// Streams test results to the given port on the given host machine. -class StreamingListener : public EmptyTestEventListener { - public: - // Abstract base class for writing strings to a socket. 
- class AbstractSocketWriter { - public: - virtual ~AbstractSocketWriter() {} - - // Sends a string to the socket. - virtual void Send(const string& message) = 0; - - // Closes the socket. - virtual void CloseConnection() {} - - // Sends a string and a newline to the socket. - void SendLn(const string& message) { - Send(message + "\n"); - } - }; - - // Concrete class for actually writing strings to a socket. - class SocketWriter : public AbstractSocketWriter { - public: - SocketWriter(const string& host, const string& port) - : sockfd_(-1), host_name_(host), port_num_(port) { - MakeConnection(); - } - - virtual ~SocketWriter() { - if (sockfd_ != -1) - CloseConnection(); - } - - // Sends a string to the socket. - virtual void Send(const string& message) { - GTEST_CHECK_(sockfd_ != -1) - << "Send() can be called only when there is a connection."; - - const int len = static_cast(message.length()); - if (write(sockfd_, message.c_str(), len) != len) { - GTEST_LOG_(WARNING) - << "stream_result_to: failed to stream to " - << host_name_ << ":" << port_num_; - } - } - - private: - // Creates a client socket and connects to the server. - void MakeConnection(); - - // Closes the socket. - void CloseConnection() { - GTEST_CHECK_(sockfd_ != -1) - << "CloseConnection() can be called only when there is a connection."; - - close(sockfd_); - sockfd_ = -1; - } - - int sockfd_; // socket file descriptor - const string host_name_; - const string port_num_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter); - }; // class SocketWriter - - // Escapes '=', '&', '%', and '\n' characters in str as "%xx". - static string UrlEncode(const char* str); - - StreamingListener(const string& host, const string& port) - : socket_writer_(new SocketWriter(host, port)) { Start(); } - - explicit StreamingListener(AbstractSocketWriter* socket_writer) - : socket_writer_(socket_writer) { Start(); } - - void OnTestProgramStart(const UnitTest& /* unit_test */) { - SendLn("event=TestProgramStart"); - } - - void OnTestProgramEnd(const UnitTest& unit_test) { - // Note that Google Test current only report elapsed time for each - // test iteration, not for the entire test program. - SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed())); - - // Notify the streaming server to stop. 
- socket_writer_->CloseConnection(); - } - - void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) { - SendLn("event=TestIterationStart&iteration=" + - StreamableToString(iteration)); - } - - void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) { - SendLn("event=TestIterationEnd&passed=" + - FormatBool(unit_test.Passed()) + "&elapsed_time=" + - StreamableToString(unit_test.elapsed_time()) + "ms"); - } - - void OnTestCaseStart(const TestCase& test_case) { - SendLn(std::string("event=TestCaseStart&name=") + test_case.name()); - } - - void OnTestCaseEnd(const TestCase& test_case) { - SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) - + "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) - + "ms"); - } - - void OnTestStart(const TestInfo& test_info) { - SendLn(std::string("event=TestStart&name=") + test_info.name()); - } - - void OnTestEnd(const TestInfo& test_info) { - SendLn("event=TestEnd&passed=" + - FormatBool((test_info.result())->Passed()) + - "&elapsed_time=" + - StreamableToString((test_info.result())->elapsed_time()) + "ms"); - } - - void OnTestPartResult(const TestPartResult& test_part_result) { - const char* file_name = test_part_result.file_name(); - if (file_name == NULL) - file_name = ""; - SendLn("event=TestPartResult&file=" + UrlEncode(file_name) + - "&line=" + StreamableToString(test_part_result.line_number()) + - "&message=" + UrlEncode(test_part_result.message())); - } - - private: - // Sends the given message and a newline to the socket. - void SendLn(const string& message) { socket_writer_->SendLn(message); } - - // Called at the start of streaming to notify the receiver what - // protocol we are using. - void Start() { SendLn("gtest_streaming_protocol_version=1.0"); } - - string FormatBool(bool value) { return value ? "1" : "0"; } - - const scoped_ptr socket_writer_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener); -}; // class StreamingListener - -#endif // GTEST_CAN_STREAM_RESULTS_ - -} // namespace internal -} // namespace testing - -#endif // GTEST_SRC_GTEST_INTERNAL_INL_H_ -#undef GTEST_IMPLEMENTATION_ - -#if GTEST_OS_WINDOWS -# define vsnprintf _vsnprintf -#endif // GTEST_OS_WINDOWS - -namespace testing { - -using internal::CountIf; -using internal::ForEach; -using internal::GetElementOr; -using internal::Shuffle; - -// Constants. - -// A test whose test case name or test name matches this filter is -// disabled and not run. -static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*"; - -// A test case whose name matches this filter is considered a death -// test case and will be run before test cases whose name doesn't -// match this filter. -static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*"; - -// A test filter that matches everything. -static const char kUniversalFilter[] = "*"; - -// The default output file for XML output. -static const char kDefaultOutputFile[] = "test_detail.xml"; - -// The environment variable name for the test shard index. -static const char kTestShardIndex[] = "GTEST_SHARD_INDEX"; -// The environment variable name for the total number of test shards. -static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS"; -// The environment variable name for the test shard status file. -static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE"; - -namespace internal { - -// The text used in failure messages to indicate the start of the -// stack trace. 
-const char kStackTraceMarker[] = "\nStack trace:\n"; - -// g_help_flag is true iff the --help flag or an equivalent form is -// specified on the command line. -bool g_help_flag = false; - -} // namespace internal - -static const char* GetDefaultFilter() { - return kUniversalFilter; -} - -GTEST_DEFINE_bool_( - also_run_disabled_tests, - internal::BoolFromGTestEnv("also_run_disabled_tests", false), - "Run disabled tests too, in addition to the tests normally being run."); - -GTEST_DEFINE_bool_( - break_on_failure, - internal::BoolFromGTestEnv("break_on_failure", false), - "True iff a failed assertion should be a debugger break-point."); - -GTEST_DEFINE_bool_( - catch_exceptions, - internal::BoolFromGTestEnv("catch_exceptions", true), - "True iff " GTEST_NAME_ - " should catch exceptions and treat them as test failures."); - -GTEST_DEFINE_string_( - color, - internal::StringFromGTestEnv("color", "auto"), - "Whether to use colors in the output. Valid values: yes, no, " - "and auto. 'auto' means to use colors if the output is " - "being sent to a terminal and the TERM environment variable " - "is set to a terminal type that supports colors."); - -GTEST_DEFINE_string_( - filter, - internal::StringFromGTestEnv("filter", GetDefaultFilter()), - "A colon-separated list of glob (not regex) patterns " - "for filtering the tests to run, optionally followed by a " - "'-' and a : separated list of negative patterns (tests to " - "exclude). A test is run if it matches one of the positive " - "patterns and does not match any of the negative patterns."); - -GTEST_DEFINE_bool_(list_tests, false, - "List all tests without running them."); - -GTEST_DEFINE_string_( - output, - internal::StringFromGTestEnv("output", ""), - "A format (currently must be \"xml\"), optionally followed " - "by a colon and an output file name or directory. A directory " - "is indicated by a trailing pathname separator. " - "Examples: \"xml:filename.xml\", \"xml::directoryname/\". " - "If a directory is specified, output files will be created " - "within that directory, with file-names based on the test " - "executable's name and, if necessary, made unique by adding " - "digits."); - -GTEST_DEFINE_bool_( - print_time, - internal::BoolFromGTestEnv("print_time", true), - "True iff " GTEST_NAME_ - " should display elapsed time in text output."); - -GTEST_DEFINE_int32_( - random_seed, - internal::Int32FromGTestEnv("random_seed", 0), - "Random number seed to use when shuffling test orders. Must be in range " - "[1, 99999], or 0 to use a seed based on the current time."); - -GTEST_DEFINE_int32_( - repeat, - internal::Int32FromGTestEnv("repeat", 1), - "How many times to repeat each test. Specify a negative number " - "for repeating forever. Useful for shaking out flaky tests."); - -GTEST_DEFINE_bool_( - show_internal_stack_frames, false, - "True iff " GTEST_NAME_ " should include internal stack frames when " - "printing test failure stack traces."); - -GTEST_DEFINE_bool_( - shuffle, - internal::BoolFromGTestEnv("shuffle", false), - "True iff " GTEST_NAME_ - " should randomize tests' order on every run."); - -GTEST_DEFINE_int32_( - stack_trace_depth, - internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth), - "The maximum number of stack frames to print when an " - "assertion fails. The valid range is 0 through 100, inclusive."); - -GTEST_DEFINE_string_( - stream_result_to, - internal::StringFromGTestEnv("stream_result_to", ""), - "This flag specifies the host name and the port number on which to stream " - "test results. 
Example: \"localhost:555\". The flag is effective only on " - "Linux."); - -GTEST_DEFINE_bool_( - throw_on_failure, - internal::BoolFromGTestEnv("throw_on_failure", false), - "When this flag is specified, a failed assertion will throw an exception " - "if exceptions are enabled or exit the program with a non-zero code " - "otherwise."); - -namespace internal { - -// Generates a random number from [0, range), using a Linear -// Congruential Generator (LCG). Crashes if 'range' is 0 or greater -// than kMaxRange. -UInt32 Random::Generate(UInt32 range) { - // These constants are the same as are used in glibc's rand(3). - state_ = (1103515245U*state_ + 12345U) % kMaxRange; - - GTEST_CHECK_(range > 0) - << "Cannot generate a number in the range [0, 0)."; - GTEST_CHECK_(range <= kMaxRange) - << "Generation of a number in [0, " << range << ") was requested, " - << "but this can only generate numbers in [0, " << kMaxRange << ")."; - - // Converting via modulus introduces a bit of downward bias, but - // it's simple, and a linear congruential generator isn't too good - // to begin with. - return state_ % range; -} - -// GTestIsInitialized() returns true iff the user has initialized -// Google Test. Useful for catching the user mistake of not initializing -// Google Test before calling RUN_ALL_TESTS(). -// -// A user must call testing::InitGoogleTest() to initialize Google -// Test. g_init_gtest_count is set to the number of times -// InitGoogleTest() has been called. We don't protect this variable -// under a mutex as it is only accessed in the main thread. -GTEST_API_ int g_init_gtest_count = 0; -static bool GTestIsInitialized() { return g_init_gtest_count != 0; } - -// Iterates over a vector of TestCases, keeping a running sum of the -// results of calling a given int-returning method on each. -// Returns the sum. -static int SumOverTestCaseList(const std::vector& case_list, - int (TestCase::*method)() const) { - int sum = 0; - for (size_t i = 0; i < case_list.size(); i++) { - sum += (case_list[i]->*method)(); - } - return sum; -} - -// Returns true iff the test case passed. -static bool TestCasePassed(const TestCase* test_case) { - return test_case->should_run() && test_case->Passed(); -} - -// Returns true iff the test case failed. -static bool TestCaseFailed(const TestCase* test_case) { - return test_case->should_run() && test_case->Failed(); -} - -// Returns true iff test_case contains at least one test that should -// run. -static bool ShouldRunTestCase(const TestCase* test_case) { - return test_case->should_run(); -} - -// AssertHelper constructor. -AssertHelper::AssertHelper(TestPartResult::Type type, - const char* file, - int line, - const char* message) - : data_(new AssertHelperData(type, file, line, message)) { -} - -AssertHelper::~AssertHelper() { - delete data_; -} - -// Message assignment, for assertion streaming support. -void AssertHelper::operator=(const Message& message) const { - UnitTest::GetInstance()-> - AddTestPartResult(data_->type, data_->file, data_->line, - AppendUserMessage(data_->message, message), - UnitTest::GetInstance()->impl() - ->CurrentOsStackTraceExceptTop(1) - // Skips the stack frame for this function itself. - ); // NOLINT -} - -// Mutex for linked pointers. -GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex); - -// Application pathname gotten in InitGoogleTest. -std::string g_executable_path; - -// Returns the current application's name, removing directory path if that -// is present. 
-FilePath GetCurrentExecutableName() { - FilePath result; - -#if GTEST_OS_WINDOWS - result.Set(FilePath(g_executable_path).RemoveExtension("exe")); -#else - result.Set(FilePath(g_executable_path)); -#endif // GTEST_OS_WINDOWS - - return result.RemoveDirectoryName(); -} - -// Functions for processing the gtest_output flag. - -// Returns the output format, or "" for normal printed output. -std::string UnitTestOptions::GetOutputFormat() { - const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); - if (gtest_output_flag == NULL) return std::string(""); - - const char* const colon = strchr(gtest_output_flag, ':'); - return (colon == NULL) ? - std::string(gtest_output_flag) : - std::string(gtest_output_flag, colon - gtest_output_flag); -} - -// Returns the name of the requested output file, or the default if none -// was explicitly specified. -std::string UnitTestOptions::GetAbsolutePathToOutputFile() { - const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); - if (gtest_output_flag == NULL) - return ""; - - const char* const colon = strchr(gtest_output_flag, ':'); - if (colon == NULL) - return internal::FilePath::ConcatPaths( - internal::FilePath( - UnitTest::GetInstance()->original_working_dir()), - internal::FilePath(kDefaultOutputFile)).string(); - - internal::FilePath output_name(colon + 1); - if (!output_name.IsAbsolutePath()) - // TODO(wan@google.com): on Windows \some\path is not an absolute - // path (as its meaning depends on the current drive), yet the - // following logic for turning it into an absolute path is wrong. - // Fix it. - output_name = internal::FilePath::ConcatPaths( - internal::FilePath(UnitTest::GetInstance()->original_working_dir()), - internal::FilePath(colon + 1)); - - if (!output_name.IsDirectory()) - return output_name.string(); - - internal::FilePath result(internal::FilePath::GenerateUniqueFileName( - output_name, internal::GetCurrentExecutableName(), - GetOutputFormat().c_str())); - return result.string(); -} - -// Returns true iff the wildcard pattern matches the string. The -// first ':' or '\0' character in pattern marks the end of it. -// -// This recursive algorithm isn't very efficient, but is clear and -// works well enough for matching test names, which are short. -bool UnitTestOptions::PatternMatchesString(const char *pattern, - const char *str) { - switch (*pattern) { - case '\0': - case ':': // Either ':' or '\0' marks the end of the pattern. - return *str == '\0'; - case '?': // Matches any single character. - return *str != '\0' && PatternMatchesString(pattern + 1, str + 1); - case '*': // Matches any string (possibly empty) of characters. - return (*str != '\0' && PatternMatchesString(pattern, str + 1)) || - PatternMatchesString(pattern + 1, str); - default: // Non-special character. Matches itself. - return *pattern == *str && - PatternMatchesString(pattern + 1, str + 1); - } -} - -bool UnitTestOptions::MatchesFilter( - const std::string& name, const char* filter) { - const char *cur_pattern = filter; - for (;;) { - if (PatternMatchesString(cur_pattern, name.c_str())) { - return true; - } - - // Finds the next pattern in the filter. - cur_pattern = strchr(cur_pattern, ':'); - - // Returns if no more pattern can be found. - if (cur_pattern == NULL) { - return false; - } - - // Skips the pattern separater (the ':' character). - cur_pattern++; - } -} - -// Returns true iff the user-specified filter matches the test case -// name and the test name. 
-bool UnitTestOptions::FilterMatchesTest(const std::string &test_case_name, - const std::string &test_name) { - const std::string& full_name = test_case_name + "." + test_name.c_str(); - - // Split --gtest_filter at '-', if there is one, to separate into - // positive filter and negative filter portions - const char* const p = GTEST_FLAG(filter).c_str(); - const char* const dash = strchr(p, '-'); - std::string positive; - std::string negative; - if (dash == NULL) { - positive = GTEST_FLAG(filter).c_str(); // Whole string is a positive filter - negative = ""; - } else { - positive = std::string(p, dash); // Everything up to the dash - negative = std::string(dash + 1); // Everything after the dash - if (positive.empty()) { - // Treat '-test1' as the same as '*-test1' - positive = kUniversalFilter; - } - } - - // A filter is a colon-separated list of patterns. It matches a - // test if any pattern in it matches the test. - return (MatchesFilter(full_name, positive.c_str()) && - !MatchesFilter(full_name, negative.c_str())); -} - -#if GTEST_HAS_SEH -// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the -// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. -// This function is useful as an __except condition. -int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) { - // Google Test should handle a SEH exception if: - // 1. the user wants it to, AND - // 2. this is not a breakpoint exception, AND - // 3. this is not a C++ exception (VC++ implements them via SEH, - // apparently). - // - // SEH exception code for C++ exceptions. - // (see http://support.microsoft.com/kb/185294 for more information). - const DWORD kCxxExceptionCode = 0xe06d7363; - - bool should_handle = true; - - if (!GTEST_FLAG(catch_exceptions)) - should_handle = false; - else if (exception_code == EXCEPTION_BREAKPOINT) - should_handle = false; - else if (exception_code == kCxxExceptionCode) - should_handle = false; - - return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH; -} -#endif // GTEST_HAS_SEH - -} // namespace internal - -// The c'tor sets this object as the test part result reporter used by -// Google Test. The 'result' parameter specifies where to report the -// results. Intercepts only failures from the current thread. -ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( - TestPartResultArray* result) - : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), - result_(result) { - Init(); -} - -// The c'tor sets this object as the test part result reporter used by -// Google Test. The 'result' parameter specifies where to report the -// results. -ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( - InterceptMode intercept_mode, TestPartResultArray* result) - : intercept_mode_(intercept_mode), - result_(result) { - Init(); -} - -void ScopedFakeTestPartResultReporter::Init() { - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - if (intercept_mode_ == INTERCEPT_ALL_THREADS) { - old_reporter_ = impl->GetGlobalTestPartResultReporter(); - impl->SetGlobalTestPartResultReporter(this); - } else { - old_reporter_ = impl->GetTestPartResultReporterForCurrentThread(); - impl->SetTestPartResultReporterForCurrentThread(this); - } -} - -// The d'tor restores the test part result reporter used by Google Test -// before. 
-ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() { - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - if (intercept_mode_ == INTERCEPT_ALL_THREADS) { - impl->SetGlobalTestPartResultReporter(old_reporter_); - } else { - impl->SetTestPartResultReporterForCurrentThread(old_reporter_); - } -} - -// Increments the test part result count and remembers the result. -// This method is from the TestPartResultReporterInterface interface. -void ScopedFakeTestPartResultReporter::ReportTestPartResult( - const TestPartResult& result) { - result_->Append(result); -} - -namespace internal { - -// Returns the type ID of ::testing::Test. We should always call this -// instead of GetTypeId< ::testing::Test>() to get the type ID of -// testing::Test. This is to work around a suspected linker bug when -// using Google Test as a framework on Mac OS X. The bug causes -// GetTypeId< ::testing::Test>() to return different values depending -// on whether the call is from the Google Test framework itself or -// from user test code. GetTestTypeId() is guaranteed to always -// return the same value, as it always calls GetTypeId<>() from the -// gtest.cc, which is within the Google Test framework. -TypeId GetTestTypeId() { - return GetTypeId(); -} - -// The value of GetTestTypeId() as seen from within the Google Test -// library. This is solely for testing GetTestTypeId(). -extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId(); - -// This predicate-formatter checks that 'results' contains a test part -// failure of the given type and that the failure message contains the -// given substring. -AssertionResult HasOneFailure(const char* /* results_expr */, - const char* /* type_expr */, - const char* /* substr_expr */, - const TestPartResultArray& results, - TestPartResult::Type type, - const string& substr) { - const std::string expected(type == TestPartResult::kFatalFailure ? - "1 fatal failure" : - "1 non-fatal failure"); - Message msg; - if (results.size() != 1) { - msg << "Expected: " << expected << "\n" - << " Actual: " << results.size() << " failures"; - for (int i = 0; i < results.size(); i++) { - msg << "\n" << results.GetTestPartResult(i); - } - return AssertionFailure() << msg; - } - - const TestPartResult& r = results.GetTestPartResult(0); - if (r.type() != type) { - return AssertionFailure() << "Expected: " << expected << "\n" - << " Actual:\n" - << r; - } - - if (strstr(r.message(), substr.c_str()) == NULL) { - return AssertionFailure() << "Expected: " << expected << " containing \"" - << substr << "\"\n" - << " Actual:\n" - << r; - } - - return AssertionSuccess(); -} - -// The constructor of SingleFailureChecker remembers where to look up -// test part results, what type of failure we expect, and what -// substring the failure message should contain. -SingleFailureChecker:: SingleFailureChecker( - const TestPartResultArray* results, - TestPartResult::Type type, - const string& substr) - : results_(results), - type_(type), - substr_(substr) {} - -// The destructor of SingleFailureChecker verifies that the given -// TestPartResultArray contains exactly one failure that has the given -// type and contains the given substring. If that's not the case, a -// non-fatal failure will be generated. 
-SingleFailureChecker::~SingleFailureChecker() { - EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_); -} - -DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter( - UnitTestImpl* unit_test) : unit_test_(unit_test) {} - -void DefaultGlobalTestPartResultReporter::ReportTestPartResult( - const TestPartResult& result) { - unit_test_->current_test_result()->AddTestPartResult(result); - unit_test_->listeners()->repeater()->OnTestPartResult(result); -} - -DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter( - UnitTestImpl* unit_test) : unit_test_(unit_test) {} - -void DefaultPerThreadTestPartResultReporter::ReportTestPartResult( - const TestPartResult& result) { - unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result); -} - -// Returns the global test part result reporter. -TestPartResultReporterInterface* -UnitTestImpl::GetGlobalTestPartResultReporter() { - internal::MutexLock lock(&global_test_part_result_reporter_mutex_); - return global_test_part_result_repoter_; -} - -// Sets the global test part result reporter. -void UnitTestImpl::SetGlobalTestPartResultReporter( - TestPartResultReporterInterface* reporter) { - internal::MutexLock lock(&global_test_part_result_reporter_mutex_); - global_test_part_result_repoter_ = reporter; -} - -// Returns the test part result reporter for the current thread. -TestPartResultReporterInterface* -UnitTestImpl::GetTestPartResultReporterForCurrentThread() { - return per_thread_test_part_result_reporter_.get(); -} - -// Sets the test part result reporter for the current thread. -void UnitTestImpl::SetTestPartResultReporterForCurrentThread( - TestPartResultReporterInterface* reporter) { - per_thread_test_part_result_reporter_.set(reporter); -} - -// Gets the number of successful test cases. -int UnitTestImpl::successful_test_case_count() const { - return CountIf(test_cases_, TestCasePassed); -} - -// Gets the number of failed test cases. -int UnitTestImpl::failed_test_case_count() const { - return CountIf(test_cases_, TestCaseFailed); -} - -// Gets the number of all test cases. -int UnitTestImpl::total_test_case_count() const { - return static_cast(test_cases_.size()); -} - -// Gets the number of all test cases that contain at least one test -// that should run. -int UnitTestImpl::test_case_to_run_count() const { - return CountIf(test_cases_, ShouldRunTestCase); -} - -// Gets the number of successful tests. -int UnitTestImpl::successful_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count); -} - -// Gets the number of failed tests. -int UnitTestImpl::failed_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count); -} - -// Gets the number of disabled tests that will be reported in the XML report. -int UnitTestImpl::reportable_disabled_test_count() const { - return SumOverTestCaseList(test_cases_, - &TestCase::reportable_disabled_test_count); -} - -// Gets the number of disabled tests. -int UnitTestImpl::disabled_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count); -} - -// Gets the number of tests to be printed in the XML report. -int UnitTestImpl::reportable_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::reportable_test_count); -} - -// Gets the number of all tests. -int UnitTestImpl::total_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::total_test_count); -} - -// Gets the number of tests that should run. 
-int UnitTestImpl::test_to_run_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count); -} - -// Returns the current OS stack trace as an std::string. -// -// The maximum number of stack frames to be included is specified by -// the gtest_stack_trace_depth flag. The skip_count parameter -// specifies the number of top frames to be skipped, which doesn't -// count against the number of frames to be included. -// -// For example, if Foo() calls Bar(), which in turn calls -// CurrentOsStackTraceExceptTop(1), Foo() will be included in the -// trace but Bar() and CurrentOsStackTraceExceptTop() won't. -std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) { - (void)skip_count; - return ""; -} - -// Returns the current time in milliseconds. -TimeInMillis GetTimeInMillis() { -#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__) - // Difference between 1970-01-01 and 1601-01-01 in milliseconds. - // http://analogous.blogspot.com/2005/04/epoch.html - const TimeInMillis kJavaEpochToWinFileTimeDelta = - static_cast(116444736UL) * 100000UL; - const DWORD kTenthMicrosInMilliSecond = 10000; - - SYSTEMTIME now_systime; - FILETIME now_filetime; - ULARGE_INTEGER now_int64; - // TODO(kenton@google.com): Shouldn't this just use - // GetSystemTimeAsFileTime()? - GetSystemTime(&now_systime); - if (SystemTimeToFileTime(&now_systime, &now_filetime)) { - now_int64.LowPart = now_filetime.dwLowDateTime; - now_int64.HighPart = now_filetime.dwHighDateTime; - now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) - - kJavaEpochToWinFileTimeDelta; - return now_int64.QuadPart; - } - return 0; -#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_ - __timeb64 now; - -# ifdef _MSC_VER - - // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996 - // (deprecated function) there. - // TODO(kenton@google.com): Use GetTickCount()? Or use - // SystemTimeToFileTime() -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4996) // Temporarily disables warning 4996. - _ftime64(&now); -# pragma warning(pop) // Restores the warning state. -# else - - _ftime64(&now); - -# endif // _MSC_VER - - return static_cast(now.time) * 1000 + now.millitm; -#elif GTEST_HAS_GETTIMEOFDAY_ - struct timeval now; - gettimeofday(&now, NULL); - return static_cast(now.tv_sec) * 1000 + now.tv_usec / 1000; -#else -# error "Don't know how to get the current time on your system." -#endif -} - -// Utilities - -// class String. - -#if GTEST_OS_WINDOWS_MOBILE -// Creates a UTF-16 wide string from the given ANSI string, allocating -// memory using new. The caller is responsible for deleting the return -// value using delete[]. Returns the wide string, or NULL if the -// input is NULL. -LPCWSTR String::AnsiToUtf16(const char* ansi) { - if (!ansi) return NULL; - const int length = strlen(ansi); - const int unicode_length = - MultiByteToWideChar(CP_ACP, 0, ansi, length, - NULL, 0); - WCHAR* unicode = new WCHAR[unicode_length + 1]; - MultiByteToWideChar(CP_ACP, 0, ansi, length, - unicode, unicode_length); - unicode[unicode_length] = 0; - return unicode; -} - -// Creates an ANSI string from the given wide string, allocating -// memory using new. The caller is responsible for deleting the return -// value using delete[]. Returns the ANSI string, or NULL if the -// input is NULL. 
-const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { - if (!utf16_str) return NULL; - const int ansi_length = - WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, - NULL, 0, NULL, NULL); - char* ansi = new char[ansi_length + 1]; - WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, - ansi, ansi_length, NULL, NULL); - ansi[ansi_length] = 0; - return ansi; -} - -#endif // GTEST_OS_WINDOWS_MOBILE - -// Compares two C strings. Returns true iff they have the same content. -// -// Unlike strcmp(), this function can handle NULL argument(s). A NULL -// C string is considered different to any non-NULL C string, -// including the empty string. -bool String::CStringEquals(const char * lhs, const char * rhs) { - if ( lhs == NULL ) return rhs == NULL; - - if ( rhs == NULL ) return false; - - return strcmp(lhs, rhs) == 0; -} - -#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING - -// Converts an array of wide chars to a narrow string using the UTF-8 -// encoding, and streams the result to the given Message object. -static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length, - Message* msg) { - for (size_t i = 0; i != length; ) { // NOLINT - if (wstr[i] != L'\0') { - *msg << WideStringToUtf8(wstr + i, static_cast(length - i)); - while (i != length && wstr[i] != L'\0') - i++; - } else { - *msg << '\0'; - i++; - } - } -} - -#endif // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING - -} // namespace internal - -// Constructs an empty Message. -// We allocate the stringstream separately because otherwise each use of -// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's -// stack frame leading to huge stack frames in some cases; gcc does not reuse -// the stack space. -Message::Message() : ss_(new ::std::stringstream) { - // By default, we want there to be enough precision when printing - // a double to a Message. - *ss_ << std::setprecision(std::numeric_limits::digits10 + 2); -} - -// These two overloads allow streaming a wide C string to a Message -// using the UTF-8 encoding. -Message& Message::operator <<(const wchar_t* wide_c_str) { - return *this << internal::String::ShowWideCString(wide_c_str); -} -Message& Message::operator <<(wchar_t* wide_c_str) { - return *this << internal::String::ShowWideCString(wide_c_str); -} - -#if GTEST_HAS_STD_WSTRING -// Converts the given wide string to a narrow string using the UTF-8 -// encoding, and streams the result to this Message object. -Message& Message::operator <<(const ::std::wstring& wstr) { - internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); - return *this; -} -#endif // GTEST_HAS_STD_WSTRING - -#if GTEST_HAS_GLOBAL_WSTRING -// Converts the given wide string to a narrow string using the UTF-8 -// encoding, and streams the result to this Message object. -Message& Message::operator <<(const ::wstring& wstr) { - internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); - return *this; -} -#endif // GTEST_HAS_GLOBAL_WSTRING - -// Gets the text streamed to this object so far as an std::string. -// Each '\0' character in the buffer is replaced with "\\0". -std::string Message::GetString() const { - return internal::StringStreamToString(ss_.get()); -} - -// AssertionResult constructors. -// Used in EXPECT_TRUE/FALSE(assertion_result). -AssertionResult::AssertionResult(const AssertionResult& other) - : success_(other.success_), - message_(other.message_.get() != NULL ? - new ::std::string(*other.message_) : - static_cast< ::std::string*>(NULL)) { -} - -// Returns the assertion's negation. 
Used with EXPECT/ASSERT_FALSE. -AssertionResult AssertionResult::operator!() const { - AssertionResult negation(!success_); - if (message_.get() != NULL) - negation << *message_; - return negation; -} - -// Makes a successful assertion result. -AssertionResult AssertionSuccess() { - return AssertionResult(true); -} - -// Makes a failed assertion result. -AssertionResult AssertionFailure() { - return AssertionResult(false); -} - -// Makes a failed assertion result with the given failure message. -// Deprecated; use AssertionFailure() << message. -AssertionResult AssertionFailure(const Message& message) { - return AssertionFailure() << message; -} - -namespace internal { - -// Constructs and returns the message for an equality assertion -// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. -// -// The first four parameters are the expressions used in the assertion -// and their values, as strings. For example, for ASSERT_EQ(foo, bar) -// where foo is 5 and bar is 6, we have: -// -// expected_expression: "foo" -// actual_expression: "bar" -// expected_value: "5" -// actual_value: "6" -// -// The ignoring_case parameter is true iff the assertion is a -// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will -// be inserted into the message. -AssertionResult EqFailure(const char* expected_expression, - const char* actual_expression, - const std::string& expected_value, - const std::string& actual_value, - bool ignoring_case) { - Message msg; - msg << "Value of: " << actual_expression; - if (actual_value != actual_expression) { - msg << "\n Actual: " << actual_value; - } - - msg << "\nExpected: " << expected_expression; - if (ignoring_case) { - msg << " (ignoring case)"; - } - if (expected_value != expected_expression) { - msg << "\nWhich is: " << expected_value; - } - - return AssertionFailure() << msg; -} - -// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. -std::string GetBoolAssertionFailureMessage( - const AssertionResult& assertion_result, - const char* expression_text, - const char* actual_predicate_value, - const char* expected_predicate_value) { - const char* actual_message = assertion_result.message(); - Message msg; - msg << "Value of: " << expression_text - << "\n Actual: " << actual_predicate_value; - if (actual_message[0] != '\0') - msg << " (" << actual_message << ")"; - msg << "\nExpected: " << expected_predicate_value; - return msg.GetString(); -} - -// Helper function for implementing ASSERT_NEAR. -AssertionResult DoubleNearPredFormat(const char* expr1, - const char* expr2, - const char* abs_error_expr, - double val1, - double val2, - double abs_error) { - const double diff = fabs(val1 - val2); - if (diff <= abs_error) return AssertionSuccess(); - - // TODO(wan): do not print the value of an expression if it's - // already a literal. - return AssertionFailure() - << "The difference between " << expr1 << " and " << expr2 - << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n" - << expr1 << " evaluates to " << val1 << ",\n" - << expr2 << " evaluates to " << val2 << ", and\n" - << abs_error_expr << " evaluates to " << abs_error << "."; -} - - -// Helper template for implementing FloatLE() and DoubleLE(). -template -AssertionResult FloatingPointLE(const char* expr1, - const char* expr2, - RawType val1, - RawType val2) { - // Returns success if val1 is less than val2, - if (val1 < val2) { - return AssertionSuccess(); - } - - // or if val1 is almost equal to val2. 
- const FloatingPoint lhs(val1), rhs(val2); - if (lhs.AlmostEquals(rhs)) { - return AssertionSuccess(); - } - - // Note that the above two checks will both fail if either val1 or - // val2 is NaN, as the IEEE floating-point standard requires that - // any predicate involving a NaN must return false. - - ::std::stringstream val1_ss; - val1_ss << std::setprecision(std::numeric_limits::digits10 + 2) - << val1; - - ::std::stringstream val2_ss; - val2_ss << std::setprecision(std::numeric_limits::digits10 + 2) - << val2; - - return AssertionFailure() - << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n" - << " Actual: " << StringStreamToString(&val1_ss) << " vs " - << StringStreamToString(&val2_ss); -} - -} // namespace internal - -// Asserts that val1 is less than, or almost equal to, val2. Fails -// otherwise. In particular, it fails if either val1 or val2 is NaN. -AssertionResult FloatLE(const char* expr1, const char* expr2, - float val1, float val2) { - return internal::FloatingPointLE(expr1, expr2, val1, val2); -} - -// Asserts that val1 is less than, or almost equal to, val2. Fails -// otherwise. In particular, it fails if either val1 or val2 is NaN. -AssertionResult DoubleLE(const char* expr1, const char* expr2, - double val1, double val2) { - return internal::FloatingPointLE(expr1, expr2, val1, val2); -} - -namespace internal { - -// The helper function for {ASSERT|EXPECT}_EQ with int or enum -// arguments. -AssertionResult CmpHelperEQ(const char* expected_expression, - const char* actual_expression, - BiggestInt expected, - BiggestInt actual) { - if (expected == actual) { - return AssertionSuccess(); - } - - return EqFailure(expected_expression, - actual_expression, - FormatForComparisonFailureMessage(expected, actual), - FormatForComparisonFailureMessage(actual, expected), - false); -} - -// A macro for implementing the helper functions needed to implement -// ASSERT_?? and EXPECT_?? with integer or enum arguments. It is here -// just to avoid copy-and-paste of similar code. -#define GTEST_IMPL_CMP_HELPER_(op_name, op)\ -AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ - BiggestInt val1, BiggestInt val2) {\ - if (val1 op val2) {\ - return AssertionSuccess();\ - } else {\ - return AssertionFailure() \ - << "Expected: (" << expr1 << ") " #op " (" << expr2\ - << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\ - << " vs " << FormatForComparisonFailureMessage(val2, val1);\ - }\ -} - -// Implements the helper function for {ASSERT|EXPECT}_NE with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(NE, !=) -// Implements the helper function for {ASSERT|EXPECT}_LE with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(LE, <=) -// Implements the helper function for {ASSERT|EXPECT}_LT with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(LT, < ) -// Implements the helper function for {ASSERT|EXPECT}_GE with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(GE, >=) -// Implements the helper function for {ASSERT|EXPECT}_GT with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(GT, > ) - -#undef GTEST_IMPL_CMP_HELPER_ - -// The helper function for {ASSERT|EXPECT}_STREQ. 
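FloatLE() and DoubleLE() above delegate the "almost equal" part to FloatingPoint<RawType>::AlmostEquals(), which is defined in gtest's headers rather than in this hunk and treats two values as equal when their bit patterns are within a small fixed number of units in the last place; as the comments note, NaN never compares equal. A simplified standalone sketch of that style of comparison for double, assuming IEEE-754 representation and an assumed 4-ULP tolerance (all names are illustrative):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Maps a double's bit pattern onto an unsigned scale that is monotonically
// ordered, so the distance between two mapped values is their ULP distance.
uint64_t ToOrderedBits(double d) {
  uint64_t bits;
  std::memcpy(&bits, &d, sizeof(bits));
  const uint64_t kSignBit = 1ULL << 63;
  return (bits & kSignBit) ? ~bits + 1 : bits | kSignBit;
}

// Returns true if lhs and rhs are within max_ulps units in the last place.
// NaN never compares equal, matching the behavior described above.
bool AlmostEqualUlps(double lhs, double rhs, uint64_t max_ulps = 4) {
  if (std::isnan(lhs) || std::isnan(rhs)) return false;
  const uint64_t a = ToOrderedBits(lhs);
  const uint64_t b = ToOrderedBits(rhs);
  return ((a > b) ? a - b : b - a) <= max_ulps;
}

int main() {
  const double sum = 0.1 + 0.2;  // not exactly 0.3 in binary floating point
  std::printf("== : %d, ulps: %d\n", sum == 0.3, AlmostEqualUlps(sum, 0.3));
  return 0;
}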
-AssertionResult CmpHelperSTREQ(const char* expected_expression, - const char* actual_expression, - const char* expected, - const char* actual) { - if (String::CStringEquals(expected, actual)) { - return AssertionSuccess(); - } - - return EqFailure(expected_expression, - actual_expression, - PrintToString(expected), - PrintToString(actual), - false); -} - -// The helper function for {ASSERT|EXPECT}_STRCASEEQ. -AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression, - const char* actual_expression, - const char* expected, - const char* actual) { - if (String::CaseInsensitiveCStringEquals(expected, actual)) { - return AssertionSuccess(); - } - - return EqFailure(expected_expression, - actual_expression, - PrintToString(expected), - PrintToString(actual), - true); -} - -// The helper function for {ASSERT|EXPECT}_STRNE. -AssertionResult CmpHelperSTRNE(const char* s1_expression, - const char* s2_expression, - const char* s1, - const char* s2) { - if (!String::CStringEquals(s1, s2)) { - return AssertionSuccess(); - } else { - return AssertionFailure() << "Expected: (" << s1_expression << ") != (" - << s2_expression << "), actual: \"" - << s1 << "\" vs \"" << s2 << "\""; - } -} - -// The helper function for {ASSERT|EXPECT}_STRCASENE. -AssertionResult CmpHelperSTRCASENE(const char* s1_expression, - const char* s2_expression, - const char* s1, - const char* s2) { - if (!String::CaseInsensitiveCStringEquals(s1, s2)) { - return AssertionSuccess(); - } else { - return AssertionFailure() - << "Expected: (" << s1_expression << ") != (" - << s2_expression << ") (ignoring case), actual: \"" - << s1 << "\" vs \"" << s2 << "\""; - } -} - -} // namespace internal - -namespace { - -// Helper functions for implementing IsSubString() and IsNotSubstring(). - -// This group of overloaded functions return true iff needle is a -// substring of haystack. NULL is considered a substring of itself -// only. - -bool IsSubstringPred(const char* needle, const char* haystack) { - if (needle == NULL || haystack == NULL) - return needle == haystack; - - return strstr(haystack, needle) != NULL; -} - -bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) { - if (needle == NULL || haystack == NULL) - return needle == haystack; - - return wcsstr(haystack, needle) != NULL; -} - -// StringType here can be either ::std::string or ::std::wstring. -template -bool IsSubstringPred(const StringType& needle, - const StringType& haystack) { - return haystack.find(needle) != StringType::npos; -} - -// This function implements either IsSubstring() or IsNotSubstring(), -// depending on the value of the expected_to_be_substring parameter. -// StringType here can be const char*, const wchar_t*, ::std::string, -// or ::std::wstring. -template -AssertionResult IsSubstringImpl( - bool expected_to_be_substring, - const char* needle_expr, const char* haystack_expr, - const StringType& needle, const StringType& haystack) { - if (IsSubstringPred(needle, haystack) == expected_to_be_substring) - return AssertionSuccess(); - - const bool is_wide_string = sizeof(needle[0]) > 1; - const char* const begin_string_quote = is_wide_string ? "L\"" : "\""; - return AssertionFailure() - << "Value of: " << needle_expr << "\n" - << " Actual: " << begin_string_quote << needle << "\"\n" - << "Expected: " << (expected_to_be_substring ? 
"" : "not ") - << "a substring of " << haystack_expr << "\n" - << "Which is: " << begin_string_quote << haystack << "\""; -} - -} // namespace - -// IsSubstring() and IsNotSubstring() check whether needle is a -// substring of haystack (NULL is considered a substring of itself -// only), and return an appropriate error message when they fail. - -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack) { - return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack) { - return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack) { - return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack) { - return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack) { - return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack) { - return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); -} - -#if GTEST_HAS_STD_WSTRING -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack) { - return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack) { - return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); -} -#endif // GTEST_HAS_STD_WSTRING - -namespace internal { - -#if GTEST_OS_WINDOWS - -namespace { - -// Helper function for IsHRESULT{SuccessFailure} predicates -AssertionResult HRESULTFailureHelper(const char* expr, - const char* expected, - long hr) { // NOLINT -# if GTEST_OS_WINDOWS_MOBILE - - // Windows CE doesn't support FormatMessage. - const char error_text[] = ""; - -# else - - // Looks up the human-readable system message for the HRESULT code - // and since we're not passing any params to FormatMessage, we don't - // want inserts expanded. - const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS; - const DWORD kBufSize = 4096; - // Gets the system's human readable message string for this HRESULT. 
- char error_text[kBufSize] = { '\0' }; - DWORD message_length = ::FormatMessageA(kFlags, - 0, // no source, we're asking system - hr, // the error - 0, // no line width restrictions - error_text, // output buffer - kBufSize, // buf size - NULL); // no arguments for inserts - // Trims tailing white space (FormatMessage leaves a trailing CR-LF) - for (; message_length && IsSpace(error_text[message_length - 1]); - --message_length) { - error_text[message_length - 1] = '\0'; - } - -# endif // GTEST_OS_WINDOWS_MOBILE - - const std::string error_hex("0x" + String::FormatHexInt(hr)); - return ::testing::AssertionFailure() - << "Expected: " << expr << " " << expected << ".\n" - << " Actual: " << error_hex << " " << error_text << "\n"; -} - -} // namespace - -AssertionResult IsHRESULTSuccess(const char* expr, long hr) { // NOLINT - if (SUCCEEDED(hr)) { - return AssertionSuccess(); - } - return HRESULTFailureHelper(expr, "succeeds", hr); -} - -AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT - if (FAILED(hr)) { - return AssertionSuccess(); - } - return HRESULTFailureHelper(expr, "fails", hr); -} - -#endif // GTEST_OS_WINDOWS - -// Utility functions for encoding Unicode text (wide strings) in -// UTF-8. - -// A Unicode code-point can have upto 21 bits, and is encoded in UTF-8 -// like this: -// -// Code-point length Encoding -// 0 - 7 bits 0xxxxxxx -// 8 - 11 bits 110xxxxx 10xxxxxx -// 12 - 16 bits 1110xxxx 10xxxxxx 10xxxxxx -// 17 - 21 bits 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - -// The maximum code-point a one-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint1 = (static_cast(1) << 7) - 1; - -// The maximum code-point a two-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint2 = (static_cast(1) << (5 + 6)) - 1; - -// The maximum code-point a three-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint3 = (static_cast(1) << (4 + 2*6)) - 1; - -// The maximum code-point a four-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint4 = (static_cast(1) << (3 + 3*6)) - 1; - -// Chops off the n lowest bits from a bit pattern. Returns the n -// lowest bits. As a side effect, the original bit pattern will be -// shifted to the right by n bits. -inline UInt32 ChopLowBits(UInt32* bits, int n) { - const UInt32 low_bits = *bits & ((static_cast(1) << n) - 1); - *bits >>= n; - return low_bits; -} - -// Converts a Unicode code point to a narrow string in UTF-8 encoding. -// code_point parameter is of type UInt32 because wchar_t may not be -// wide enough to contain a code point. -// If the code_point is not a valid Unicode code point -// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted -// to "(Invalid Unicode 0xXXXXXXXX)". -std::string CodePointToUtf8(UInt32 code_point) { - if (code_point > kMaxCodePoint4) { - return "(Invalid Unicode 0x" + String::FormatHexInt(code_point) + ")"; - } - - char str[5]; // Big enough for the largest valid code point. 
- if (code_point <= kMaxCodePoint1) { - str[1] = '\0'; - str[0] = static_cast(code_point); // 0xxxxxxx - } else if (code_point <= kMaxCodePoint2) { - str[2] = '\0'; - str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[0] = static_cast(0xC0 | code_point); // 110xxxxx - } else if (code_point <= kMaxCodePoint3) { - str[3] = '\0'; - str[2] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[0] = static_cast(0xE0 | code_point); // 1110xxxx - } else { // code_point <= kMaxCodePoint4 - str[4] = '\0'; - str[3] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[2] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[0] = static_cast(0xF0 | code_point); // 11110xxx - } - return str; -} - -// The following two functions only make sense if the the system -// uses UTF-16 for wide string encoding. All supported systems -// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16. - -// Determines if the arguments constitute UTF-16 surrogate pair -// and thus should be combined into a single Unicode code point -// using CreateCodePointFromUtf16SurrogatePair. -inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) { - return sizeof(wchar_t) == 2 && - (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00; -} - -// Creates a Unicode code point from UTF16 surrogate pair. -inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first, - wchar_t second) { - const UInt32 mask = (1 << 10) - 1; - return (sizeof(wchar_t) == 2) ? - (((first & mask) << 10) | (second & mask)) + 0x10000 : - // This function should not be called when the condition is - // false, but we provide a sensible default in case it is. - static_cast(first); -} - -// Converts a wide string to a narrow string in UTF-8 encoding. -// The wide string is assumed to have the following encoding: -// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) -// UTF-32 if sizeof(wchar_t) == 4 (on Linux) -// Parameter str points to a null-terminated wide string. -// Parameter num_chars may additionally limit the number -// of wchar_t characters processed. -1 is used when the entire string -// should be processed. -// If the string contains code points that are not valid Unicode code points -// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output -// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding -// and contains invalid UTF-16 surrogate pairs, values in those pairs -// will be encoded as individual Unicode characters from Basic Normal Plane. -std::string WideStringToUtf8(const wchar_t* str, int num_chars) { - if (num_chars == -1) - num_chars = static_cast(wcslen(str)); - - ::std::stringstream stream; - for (int i = 0; i < num_chars; ++i) { - UInt32 unicode_code_point; - - if (str[i] == L'\0') { - break; - } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) { - unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i], - str[i + 1]); - i++; - } else { - unicode_code_point = static_cast(str[i]); - } - - stream << CodePointToUtf8(unicode_code_point); - } - return StringStreamToString(&stream); -} - -// Converts a wide C string to an std::string using the UTF-8 encoding. -// NULL will be converted to "(null)". 
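To make the surrogate-pair handling above concrete: U+1F600 is stored in UTF-16 as the pair 0xD83D 0xDE00; CreateCodePointFromUtf16SurrogatePair() keeps the low ten bits of each half, concatenates them and adds 0x10000, and CodePointToUtf8() then emits the four bytes F0 9F 98 80. A small standalone check of that arithmetic (helper names are illustrative, not gtest's):

#include <cstdint>
#include <cstdio>
#include <string>

// Combines a UTF-16 surrogate pair into a Unicode code point, using the same
// formula as the removed CreateCodePointFromUtf16SurrogatePair().
uint32_t CombineSurrogates(uint16_t high, uint16_t low) {
  const uint32_t kMask = (1 << 10) - 1;
  return (((high & kMask) << 10) | (low & kMask)) + 0x10000;
}

// Encodes a code point in the U+10000..U+10FFFF range as four UTF-8 bytes.
// Sketch only; the removed CodePointToUtf8() handles all sequence lengths.
std::string EncodeUtf8FourBytes(uint32_t cp) {
  std::string out;
  out += static_cast<char>(0xF0 | (cp >> 18));
  out += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
  out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
  out += static_cast<char>(0x80 | (cp & 0x3F));
  return out;
}

int main() {
  const uint32_t cp = CombineSurrogates(0xD83D, 0xDE00);
  std::printf("U+%X =", cp);  // Expected: U+1F600 = F0 9F 98 80
  for (unsigned char byte : EncodeUtf8FourBytes(cp)) std::printf(" %02X", byte);
  std::printf("\n");
  return 0;
}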
-std::string String::ShowWideCString(const wchar_t * wide_c_str) { - if (wide_c_str == NULL) return "(null)"; - - return internal::WideStringToUtf8(wide_c_str, -1); -} - -// Compares two wide C strings. Returns true iff they have the same -// content. -// -// Unlike wcscmp(), this function can handle NULL argument(s). A NULL -// C string is considered different to any non-NULL C string, -// including the empty string. -bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) { - if (lhs == NULL) return rhs == NULL; - - if (rhs == NULL) return false; - - return wcscmp(lhs, rhs) == 0; -} - -// Helper function for *_STREQ on wide strings. -AssertionResult CmpHelperSTREQ(const char* expected_expression, - const char* actual_expression, - const wchar_t* expected, - const wchar_t* actual) { - if (String::WideCStringEquals(expected, actual)) { - return AssertionSuccess(); - } - - return EqFailure(expected_expression, - actual_expression, - PrintToString(expected), - PrintToString(actual), - false); -} - -// Helper function for *_STRNE on wide strings. -AssertionResult CmpHelperSTRNE(const char* s1_expression, - const char* s2_expression, - const wchar_t* s1, - const wchar_t* s2) { - if (!String::WideCStringEquals(s1, s2)) { - return AssertionSuccess(); - } - - return AssertionFailure() << "Expected: (" << s1_expression << ") != (" - << s2_expression << "), actual: " - << PrintToString(s1) - << " vs " << PrintToString(s2); -} - -// Compares two C strings, ignoring case. Returns true iff they have -// the same content. -// -// Unlike strcasecmp(), this function can handle NULL argument(s). A -// NULL C string is considered different to any non-NULL C string, -// including the empty string. -bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) { - if (lhs == NULL) - return rhs == NULL; - if (rhs == NULL) - return false; - return posix::StrCaseCmp(lhs, rhs) == 0; -} - - // Compares two wide C strings, ignoring case. Returns true iff they - // have the same content. - // - // Unlike wcscasecmp(), this function can handle NULL argument(s). - // A NULL C string is considered different to any non-NULL wide C string, - // including the empty string. - // NB: The implementations on different platforms slightly differ. - // On windows, this method uses _wcsicmp which compares according to LC_CTYPE - // environment variable. On GNU platform this method uses wcscasecmp - // which compares according to LC_CTYPE category of the current locale. - // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the - // current locale. -bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs, - const wchar_t* rhs) { - if (lhs == NULL) return rhs == NULL; - - if (rhs == NULL) return false; - -#if GTEST_OS_WINDOWS - return _wcsicmp(lhs, rhs) == 0; -#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID - return wcscasecmp(lhs, rhs) == 0; -#else - // Android, Mac OS X and Cygwin don't define wcscasecmp. - // Other unknown OSes may not define it either. - wint_t left, right; - do { - left = towlower(*lhs++); - right = towlower(*rhs++); - } while (left && left == right); - return left == right; -#endif // OS selector -} - -// Returns true iff str ends with the given suffix, ignoring case. -// Any string is considered to end with an empty suffix. 
-bool String::EndsWithCaseInsensitive( - const std::string& str, const std::string& suffix) { - const size_t str_len = str.length(); - const size_t suffix_len = suffix.length(); - return (str_len >= suffix_len) && - CaseInsensitiveCStringEquals(str.c_str() + str_len - suffix_len, - suffix.c_str()); -} - -// Formats an int value as "%02d". -std::string String::FormatIntWidth2(int value) { - std::stringstream ss; - ss << std::setfill('0') << std::setw(2) << value; - return ss.str(); -} - -// Formats an int value as "%X". -std::string String::FormatHexInt(int value) { - std::stringstream ss; - ss << std::hex << std::uppercase << value; - return ss.str(); -} - -// Formats a byte as "%02X". -std::string String::FormatByte(unsigned char value) { - std::stringstream ss; - ss << std::setfill('0') << std::setw(2) << std::hex << std::uppercase - << static_cast(value); - return ss.str(); -} - -// Converts the buffer in a stringstream to an std::string, converting NUL -// bytes to "\\0" along the way. -std::string StringStreamToString(::std::stringstream* ss) { - const ::std::string& str = ss->str(); - const char* const start = str.c_str(); - const char* const end = start + str.length(); - - std::string result; - result.reserve(2 * (end - start)); - for (const char* ch = start; ch != end; ++ch) { - if (*ch == '\0') { - result += "\\0"; // Replaces NUL with "\\0"; - } else { - result += *ch; - } - } - - return result; -} - -// Appends the user-supplied message to the Google-Test-generated message. -std::string AppendUserMessage(const std::string& gtest_msg, - const Message& user_msg) { - // Appends the user message if it's non-empty. - const std::string user_msg_string = user_msg.GetString(); - if (user_msg_string.empty()) { - return gtest_msg; - } - - return gtest_msg + "\n" + user_msg_string; -} - -} // namespace internal - -// class TestResult - -// Creates an empty TestResult. -TestResult::TestResult() - : death_test_count_(0), - elapsed_time_(0) { -} - -// D'tor. -TestResult::~TestResult() { -} - -// Returns the i-th test part result among all the results. i can -// range from 0 to total_part_count() - 1. If i is not in that range, -// aborts the program. -const TestPartResult& TestResult::GetTestPartResult(int i) const { - if (i < 0 || i >= total_part_count()) - internal::posix::Abort(); - return test_part_results_.at(i); -} - -// Returns the i-th test property. i can range from 0 to -// test_property_count() - 1. If i is not in that range, aborts the -// program. -const TestProperty& TestResult::GetTestProperty(int i) const { - if (i < 0 || i >= test_property_count()) - internal::posix::Abort(); - return test_properties_.at(i); -} - -// Clears the test part results. -void TestResult::ClearTestPartResults() { - test_part_results_.clear(); -} - -// Adds a test part result to the list. -void TestResult::AddTestPartResult(const TestPartResult& test_part_result) { - test_part_results_.push_back(test_part_result); -} - -// Adds a test property to the list. If a property with the same key as the -// supplied property is already represented, the value of this test_property -// replaces the old value for that key. 
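StringStreamToString() above copies the stream buffer byte by byte so that a message containing an embedded NUL is not silently truncated by C-string handling; each '\0' comes out as the two characters "\0". A short standalone sketch of that behavior (the helper name is made up):

#include <cstdio>
#include <sstream>
#include <string>

// Copies a stringstream's buffer into a std::string, replacing each embedded
// NUL byte with the two-character escape "\0", as the removed
// StringStreamToString() does. Illustrative sketch only.
std::string EscapeNulBytes(std::stringstream* ss) {
  const std::string& str = ss->str();
  std::string result;
  result.reserve(2 * str.size());
  for (char ch : str) {
    if (ch == '\0') {
      result += "\\0";
    } else {
      result += ch;
    }
  }
  return result;
}

int main() {
  std::stringstream ss;
  ss << "ab" << '\0' << "cd";                        // buffer: a b NUL c d
  std::printf("%s\n", EscapeNulBytes(&ss).c_str());  // prints: ab\0cd
  return 0;
}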
-void TestResult::RecordProperty(const std::string& xml_element, - const TestProperty& test_property) { - if (!ValidateTestProperty(xml_element, test_property)) { - return; - } - internal::MutexLock lock(&test_properites_mutex_); - const std::vector::iterator property_with_matching_key = - std::find_if(test_properties_.begin(), test_properties_.end(), - internal::TestPropertyKeyIs(test_property.key())); - if (property_with_matching_key == test_properties_.end()) { - test_properties_.push_back(test_property); - return; - } - property_with_matching_key->SetValue(test_property.value()); -} - -// The list of reserved attributes used in the element of XML -// output. -static const char* const kReservedTestSuitesAttributes[] = { - "disabled", - "errors", - "failures", - "name", - "random_seed", - "tests", - "time", - "timestamp" -}; - -// The list of reserved attributes used in the element of XML -// output. -static const char* const kReservedTestSuiteAttributes[] = { - "disabled", - "errors", - "failures", - "name", - "tests", - "time" -}; - -// The list of reserved attributes used in the element of XML output. -static const char* const kReservedTestCaseAttributes[] = { - "classname", - "name", - "status", - "time", - "type_param", - "value_param" -}; - -template -std::vector ArrayAsVector(const char* const (&array)[kSize]) { - return std::vector(array, array + kSize); -} - -static std::vector GetReservedAttributesForElement( - const std::string& xml_element) { - if (xml_element == "testsuites") { - return ArrayAsVector(kReservedTestSuitesAttributes); - } else if (xml_element == "testsuite") { - return ArrayAsVector(kReservedTestSuiteAttributes); - } else if (xml_element == "testcase") { - return ArrayAsVector(kReservedTestCaseAttributes); - } else { - GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element; - } - // This code is unreachable but some compilers may not realizes that. - return std::vector(); -} - -static std::string FormatWordList(const std::vector& words) { - Message word_list; - for (size_t i = 0; i < words.size(); ++i) { - if (i > 0 && words.size() > 2) { - word_list << ", "; - } - if (i == words.size() - 1) { - word_list << "and "; - } - word_list << "'" << words[i] << "'"; - } - return word_list.GetString(); -} - -bool ValidateTestPropertyName(const std::string& property_name, - const std::vector& reserved_names) { - if (std::find(reserved_names.begin(), reserved_names.end(), property_name) != - reserved_names.end()) { - ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name - << " (" << FormatWordList(reserved_names) - << " are reserved by " << GTEST_NAME_ << ")"; - return false; - } - return true; -} - -// Adds a failure if the key is a reserved attribute of the element named -// xml_element. Returns true if the property is valid. -bool TestResult::ValidateTestProperty(const std::string& xml_element, - const TestProperty& test_property) { - return ValidateTestPropertyName(test_property.key(), - GetReservedAttributesForElement(xml_element)); -} - -// Clears the object. -void TestResult::Clear() { - test_part_results_.clear(); - test_properties_.clear(); - death_test_count_ = 0; - elapsed_time_ = 0; -} - -// Returns true iff the test failed. -bool TestResult::Failed() const { - for (int i = 0; i < total_part_count(); ++i) { - if (GetTestPartResult(i).failed()) - return true; - } - return false; -} - -// Returns true iff the test part fatally failed. 
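The reserved-attribute tables above are what ValidateTestProperty() checks RecordProperty() calls against: a key that collides with an attribute Google Test itself writes to the XML report (for example "name", "status" or "time" on a testcase element) is rejected with an ADD_FAILURE(). A usage-level sketch, assuming the file is linked against gtest_main; the test and property names are made up:

#include "gtest/gtest.h"

TEST(RecordPropertyDemo, CustomKeysAreKept) {
  // A user-chosen key is attached to this test's testcase element in the
  // XML report; the int overload shown above formats the value for us.
  RecordProperty("build_id", 1234);
}

TEST(RecordPropertyDemo, ReservedKeysAreRejected) {
  // "time" is listed in kReservedTestCaseAttributes, so this call is expected
  // to add a failure ("Reserved key used in RecordProperty(): ...") instead
  // of recording anything.
  RecordProperty("time", "42");
}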
-static bool TestPartFatallyFailed(const TestPartResult& result) { - return result.fatally_failed(); -} - -// Returns true iff the test fatally failed. -bool TestResult::HasFatalFailure() const { - return CountIf(test_part_results_, TestPartFatallyFailed) > 0; -} - -// Returns true iff the test part non-fatally failed. -static bool TestPartNonfatallyFailed(const TestPartResult& result) { - return result.nonfatally_failed(); -} - -// Returns true iff the test has a non-fatal failure. -bool TestResult::HasNonfatalFailure() const { - return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0; -} - -// Gets the number of all test parts. This is the sum of the number -// of successful test parts and the number of failed test parts. -int TestResult::total_part_count() const { - return static_cast(test_part_results_.size()); -} - -// Returns the number of the test properties. -int TestResult::test_property_count() const { - return static_cast(test_properties_.size()); -} - -// class Test - -// Creates a Test object. - -// The c'tor saves the values of all Google Test flags. -Test::Test() - : gtest_flag_saver_(new internal::GTestFlagSaver) { -} - -// The d'tor restores the values of all Google Test flags. -Test::~Test() { - delete gtest_flag_saver_; -} - -// Sets up the test fixture. -// -// A sub-class may override this. -void Test::SetUp() { -} - -// Tears down the test fixture. -// -// A sub-class may override this. -void Test::TearDown() { -} - -// Allows user supplied key value pairs to be recorded for later output. -void Test::RecordProperty(const std::string& key, const std::string& value) { - UnitTest::GetInstance()->RecordProperty(key, value); -} - -// Allows user supplied key value pairs to be recorded for later output. -void Test::RecordProperty(const std::string& key, int value) { - Message value_message; - value_message << value; - RecordProperty(key, value_message.GetString().c_str()); -} - -namespace internal { - -void ReportFailureInUnknownLocation(TestPartResult::Type result_type, - const std::string& message) { - // This function is a friend of UnitTest and as such has access to - // AddTestPartResult. - UnitTest::GetInstance()->AddTestPartResult( - result_type, - NULL, // No info about the source file where the exception occurred. - -1, // We have no info on which line caused the exception. - message, - ""); // No stack trace, either. -} - -} // namespace internal - -// Google Test requires all tests in the same test case to use the same test -// fixture class. This function checks if the current test has the -// same fixture class as the first test in the current test case. If -// yes, it returns true; otherwise it generates a Google Test failure and -// returns false. -bool Test::HasSameFixtureClass() { - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - const TestCase* const test_case = impl->current_test_case(); - - // Info about the first test in the current test case. - const TestInfo* const first_test_info = test_case->test_info_list()[0]; - const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_; - const char* const first_test_name = first_test_info->name(); - - // Info about the current test. - const TestInfo* const this_test_info = impl->current_test_info(); - const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_; - const char* const this_test_name = this_test_info->name(); - - if (this_fixture_id != first_fixture_id) { - // Is the first test defined using TEST? 
- const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId(); - // Is this test defined using TEST? - const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId(); - - if (first_is_TEST || this_is_TEST) { - // The user mixed TEST and TEST_F in this test case - we'll tell - // him/her how to fix it. - - // Gets the name of the TEST and the name of the TEST_F. Note - // that first_is_TEST and this_is_TEST cannot both be true, as - // the fixture IDs are different for the two tests. - const char* const TEST_name = - first_is_TEST ? first_test_name : this_test_name; - const char* const TEST_F_name = - first_is_TEST ? this_test_name : first_test_name; - - ADD_FAILURE() - << "All tests in the same test case must use the same test fixture\n" - << "class, so mixing TEST_F and TEST in the same test case is\n" - << "illegal. In test case " << this_test_info->test_case_name() - << ",\n" - << "test " << TEST_F_name << " is defined using TEST_F but\n" - << "test " << TEST_name << " is defined using TEST. You probably\n" - << "want to change the TEST to TEST_F or move it to another test\n" - << "case."; - } else { - // The user defined two fixture classes with the same name in - // two namespaces - we'll tell him/her how to fix it. - ADD_FAILURE() - << "All tests in the same test case must use the same test fixture\n" - << "class. However, in test case " - << this_test_info->test_case_name() << ",\n" - << "you defined test " << first_test_name - << " and test " << this_test_name << "\n" - << "using two different test fixture classes. This can happen if\n" - << "the two classes are from different namespaces or translation\n" - << "units and have the same name. You should probably rename one\n" - << "of the classes to put the tests into different test cases."; - } - return false; - } - - return true; -} - -#if GTEST_HAS_SEH - -// Adds an "exception thrown" fatal failure to the current test. This -// function returns its result via an output parameter pointer because VC++ -// prohibits creation of objects with destructors on stack in functions -// using __try (see error C2712). -static std::string* FormatSehExceptionMessage(DWORD exception_code, - const char* location) { - Message message; - message << "SEH exception with code 0x" << std::setbase(16) << - exception_code << std::setbase(10) << " thrown in " << location << "."; - - return new std::string(message.GetString()); -} - -#endif // GTEST_HAS_SEH - -namespace internal { - -#if GTEST_HAS_EXCEPTIONS - -// Adds an "exception thrown" fatal failure to the current test. -static std::string FormatCxxExceptionMessage(const char* description, - const char* location) { - Message message; - if (description != NULL) { - message << "C++ exception with description \"" << description << "\""; - } else { - message << "Unknown C++ exception"; - } - message << " thrown in " << location << "."; - - return message.GetString(); -} - -static std::string PrintTestPartResultToString( - const TestPartResult& test_part_result); - -GoogleTestFailureException::GoogleTestFailureException( - const TestPartResult& failure) - : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {} - -#endif // GTEST_HAS_EXCEPTIONS - -// We put these helper functions in the internal namespace as IBM's xlC -// compiler rejects the code if they were declared static. - -// Runs the given method and handles SEH exceptions it throws, when -// SEH is supported; returns the 0-value for type Result in case of an -// SEH exception. 
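HasSameFixtureClass() above turns a common mistake into an explicit failure: two tests that share a test case name but are declared with TEST in one place and TEST_F in another end up with different fixture ids. A minimal reproduction, assuming linkage against gtest_main (the fixture and test names are made up):

#include "gtest/gtest.h"

class WidgetTest : public ::testing::Test {};

// Uses the WidgetTest fixture.
TEST_F(WidgetTest, UsesTheFixture) {
  EXPECT_TRUE(true);
}

// Reuses the test case name "WidgetTest" without the fixture. This compiles,
// but at run time HasSameFixtureClass() reports that mixing TEST_F and TEST
// in the same test case is illegal.
TEST(WidgetTest, MixesMacros) {
  EXPECT_TRUE(true);
}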
(Microsoft compilers cannot handle SEH and C++ -// exceptions in the same function. Therefore, we provide a separate -// wrapper function for handling SEH exceptions.) -template -Result HandleSehExceptionsInMethodIfSupported( - T* object, Result (T::*method)(), const char* location) { -#if GTEST_HAS_SEH - __try { - return (object->*method)(); - } __except (internal::UnitTestOptions::GTestShouldProcessSEH( // NOLINT - GetExceptionCode())) { - // We create the exception message on the heap because VC++ prohibits - // creation of objects with destructors on stack in functions using __try - // (see error C2712). - std::string* exception_message = FormatSehExceptionMessage( - GetExceptionCode(), location); - internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure, - *exception_message); - delete exception_message; - return static_cast(0); - } -#else - (void)location; - return (object->*method)(); -#endif // GTEST_HAS_SEH -} - -// Runs the given method and catches and reports C++ and/or SEH-style -// exceptions, if they are supported; returns the 0-value for type -// Result in case of an SEH exception. -template -Result HandleExceptionsInMethodIfSupported( - T* object, Result (T::*method)(), const char* location) { - // NOTE: The user code can affect the way in which Google Test handles - // exceptions by setting GTEST_FLAG(catch_exceptions), but only before - // RUN_ALL_TESTS() starts. It is technically possible to check the flag - // after the exception is caught and either report or re-throw the - // exception based on the flag's value: - // - // try { - // // Perform the test method. - // } catch (...) { - // if (GTEST_FLAG(catch_exceptions)) - // // Report the exception as failure. - // else - // throw; // Re-throws the original exception. - // } - // - // However, the purpose of this flag is to allow the program to drop into - // the debugger when the exception is thrown. On most platforms, once the - // control enters the catch block, the exception origin information is - // lost and the debugger will stop the program at the point of the - // re-throw in this function -- instead of at the point of the original - // throw statement in the code under test. For this reason, we perform - // the check early, sacrificing the ability to affect Google Test's - // exception handling in the method where the exception is thrown. - if (internal::GetUnitTestImpl()->catch_exceptions()) { -#if GTEST_HAS_EXCEPTIONS - try { - return HandleSehExceptionsInMethodIfSupported(object, method, location); - } catch (const internal::GoogleTestFailureException&) { // NOLINT - // This exception type can only be thrown by a failed Google - // Test assertion with the intention of letting another testing - // framework catch it. Therefore we just re-throw it. - throw; - } catch (const std::exception& e) { // NOLINT - internal::ReportFailureInUnknownLocation( - TestPartResult::kFatalFailure, - FormatCxxExceptionMessage(e.what(), location)); - } catch (...) { // NOLINT - internal::ReportFailureInUnknownLocation( - TestPartResult::kFatalFailure, - FormatCxxExceptionMessage(NULL, location)); - } - return static_cast(0); -#else - return HandleSehExceptionsInMethodIfSupported(object, method, location); -#endif // GTEST_HAS_EXCEPTIONS - } else { - return (object->*method)(); - } -} - -} // namespace internal - -// Runs the test and updates the test result. 
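HandleExceptionsInMethodIfSupported() above follows a general pattern: invoke a member function and, when exception catching is enabled, convert anything it throws into a recorded fatal failure instead of letting it unwind out of the test runner. A stripped-down standalone sketch of the same pattern with the failure reporter reduced to a plain function; it omits the SEH branch and the GoogleTestFailureException special case, and every name in it is illustrative:

#include <cstdio>
#include <exception>
#include <stdexcept>
#include <string>

// Reduced stand-in for ReportFailureInUnknownLocation().
void ReportFailure(const std::string& message) {
  std::printf("FAILURE: %s\n", message.c_str());
}

// Runs (object->*method)() and reports, rather than propagates, any C++
// exception when catch_exceptions is true.
template <class T, typename Result>
Result RunAndCatch(T* object, Result (T::*method)(), const char* location,
                   bool catch_exceptions) {
  if (!catch_exceptions) return (object->*method)();
  try {
    return (object->*method)();
  } catch (const std::exception& e) {
    ReportFailure(std::string("C++ exception with description \"") + e.what() +
                  "\" thrown in " + location + ".");
  } catch (...) {
    ReportFailure(std::string("Unknown C++ exception thrown in ") + location +
                  ".");
  }
  return Result();  // the zero value for Result, as in the removed code
}

struct FlakySetUp {
  int Run() { throw std::runtime_error("database unavailable"); }
};

int main() {
  FlakySetUp fixture;
  // Prints a failure message instead of terminating on an uncaught throw.
  RunAndCatch(&fixture, &FlakySetUp::Run, "SetUp()", true);
  return 0;
}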
-void Test::Run() { - if (!HasSameFixtureClass()) return; - - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()"); - // We will run the test only if SetUp() was successful. - if (!HasFatalFailure()) { - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &Test::TestBody, "the test body"); - } - - // However, we want to clean up as much as possible. Hence we will - // always call TearDown(), even if SetUp() or the test body has - // failed. - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &Test::TearDown, "TearDown()"); -} - -// Returns true iff the current test has a fatal failure. -bool Test::HasFatalFailure() { - return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure(); -} - -// Returns true iff the current test has a non-fatal failure. -bool Test::HasNonfatalFailure() { - return internal::GetUnitTestImpl()->current_test_result()-> - HasNonfatalFailure(); -} - -// class TestInfo - -// Constructs a TestInfo object. It assumes ownership of the test factory -// object. -TestInfo::TestInfo(const std::string& a_test_case_name, - const std::string& a_name, - const char* a_type_param, - const char* a_value_param, - internal::TypeId fixture_class_id, - internal::TestFactoryBase* factory) - : test_case_name_(a_test_case_name), - name_(a_name), - type_param_(a_type_param ? new std::string(a_type_param) : NULL), - value_param_(a_value_param ? new std::string(a_value_param) : NULL), - fixture_class_id_(fixture_class_id), - should_run_(false), - is_disabled_(false), - matches_filter_(false), - factory_(factory), - result_() {} - -// Destructs a TestInfo object. -TestInfo::~TestInfo() { delete factory_; } - -namespace internal { - -// Creates a new TestInfo object and registers it with Google Test; -// returns the created object. -// -// Arguments: -// -// test_case_name: name of the test case -// name: name of the test -// type_param: the name of the test's type parameter, or NULL if -// this is not a typed or a type-parameterized test. -// value_param: text representation of the test's value parameter, -// or NULL if this is not a value-parameterized test. -// fixture_class_id: ID of the test fixture class -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case -// factory: pointer to the factory that creates a test object. -// The newly created TestInfo instance will assume -// ownership of the factory object. -TestInfo* MakeAndRegisterTestInfo( - const char* test_case_name, - const char* name, - const char* type_param, - const char* value_param, - TypeId fixture_class_id, - SetUpTestCaseFunc set_up_tc, - TearDownTestCaseFunc tear_down_tc, - TestFactoryBase* factory) { - TestInfo* const test_info = - new TestInfo(test_case_name, name, type_param, value_param, - fixture_class_id, factory); - GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info); - return test_info; -} - -#if GTEST_HAS_PARAM_TEST -void ReportInvalidTestCaseType(const char* test_case_name, - const char* file, int line) { - Message errors; - errors - << "Attempted redefinition of test case " << test_case_name << ".\n" - << "All tests in the same test case must use the same test fixture\n" - << "class. 
However, in test case " << test_case_name << ", you tried\n" - << "to define a test using a fixture class different from the one\n" - << "used earlier. This can happen if the two fixture classes are\n" - << "from different namespaces and have the same name. You should\n" - << "probably rename one of the classes to put the tests into different\n" - << "test cases."; - - fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(), - errors.GetString().c_str()); -} -#endif // GTEST_HAS_PARAM_TEST - -} // namespace internal - -namespace { - -// A predicate that checks the test name of a TestInfo against a known -// value. -// -// This is used for implementation of the TestCase class only. We put -// it in the anonymous namespace to prevent polluting the outer -// namespace. -// -// TestNameIs is copyable. -class TestNameIs { - public: - // Constructor. - // - // TestNameIs has NO default constructor. - explicit TestNameIs(const char* name) - : name_(name) {} - - // Returns true iff the test name of test_info matches name_. - bool operator()(const TestInfo * test_info) const { - return test_info && test_info->name() == name_; - } - - private: - std::string name_; -}; - -} // namespace - -namespace internal { - -// This method expands all parameterized tests registered with macros TEST_P -// and INSTANTIATE_TEST_CASE_P into regular tests and registers those. -// This will be done just once during the program runtime. -void UnitTestImpl::RegisterParameterizedTests() { -#if GTEST_HAS_PARAM_TEST - if (!parameterized_tests_registered_) { - parameterized_test_registry_.RegisterTests(); - parameterized_tests_registered_ = true; - } -#endif -} - -} // namespace internal - -// Creates the test object, runs it, records its result, and then -// deletes it. -void TestInfo::Run() { - if (!should_run_) return; - - // Tells UnitTest where to store test result. - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - impl->set_current_test_info(this); - - TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); - - // Notifies the unit test event listeners that a test is about to start. - repeater->OnTestStart(*this); - - const TimeInMillis start = internal::GetTimeInMillis(); - - impl->os_stack_trace_getter()->UponLeavingGTest(); - - // Creates the test object. - Test* const test = internal::HandleExceptionsInMethodIfSupported( - factory_, &internal::TestFactoryBase::CreateTest, - "the test fixture's constructor"); - - // Runs the test only if the test object was created and its - // constructor didn't generate a fatal failure. - if ((test != NULL) && !Test::HasFatalFailure()) { - // This doesn't throw as all user code that can throw are wrapped into - // exception handling code. - test->Run(); - } - - // Deletes the test object. - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - test, &Test::DeleteSelf_, "the test fixture's destructor"); - - result_.set_elapsed_time(internal::GetTimeInMillis() - start); - - // Notifies the unit test event listener that a test has just finished. - repeater->OnTestEnd(*this); - - // Tells UnitTest to stop associating assertion results to this - // test. - impl->set_current_test_info(NULL); -} - -// class TestCase - -// Gets the number of successful tests in this test case. -int TestCase::successful_test_count() const { - return CountIf(test_info_list_, TestPassed); -} - -// Gets the number of failed tests in this test case. 
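RegisterParameterizedTests() above is the hook that expands TEST_P tests and their INSTANTIATE_TEST_CASE_P instantiations into ordinary registered tests at startup. A usage-level sketch of that macro pair, assuming linkage against gtest_main (the test names and parameter values are made up):

#include "gtest/gtest.h"

class ParityTest : public ::testing::TestWithParam<int> {};

// One TEST_P body, run once per parameter value supplied below.
TEST_P(ParityTest, DoublingIsEven) {
  EXPECT_EQ(0, (GetParam() * 2) % 2);
}

// Expanded into ordinary tests by RegisterParameterizedTests() at startup.
INSTANTIATE_TEST_CASE_P(SmallValues, ParityTest, ::testing::Values(1, 2, 3));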
-int TestCase::failed_test_count() const { - return CountIf(test_info_list_, TestFailed); -} - -// Gets the number of disabled tests that will be reported in the XML report. -int TestCase::reportable_disabled_test_count() const { - return CountIf(test_info_list_, TestReportableDisabled); -} - -// Gets the number of disabled tests in this test case. -int TestCase::disabled_test_count() const { - return CountIf(test_info_list_, TestDisabled); -} - -// Gets the number of tests to be printed in the XML report. -int TestCase::reportable_test_count() const { - return CountIf(test_info_list_, TestReportable); -} - -// Get the number of tests in this test case that should run. -int TestCase::test_to_run_count() const { - return CountIf(test_info_list_, ShouldRunTest); -} - -// Gets the number of all tests. -int TestCase::total_test_count() const { - return static_cast(test_info_list_.size()); -} - -// Creates a TestCase with the given name. -// -// Arguments: -// -// name: name of the test case -// a_type_param: the name of the test case's type parameter, or NULL if -// this is not a typed or a type-parameterized test case. -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case -TestCase::TestCase(const char* a_name, const char* a_type_param, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc) - : name_(a_name), - type_param_(a_type_param ? new std::string(a_type_param) : NULL), - set_up_tc_(set_up_tc), - tear_down_tc_(tear_down_tc), - should_run_(false), - elapsed_time_(0) { -} - -// Destructor of TestCase. -TestCase::~TestCase() { - // Deletes every Test in the collection. - ForEach(test_info_list_, internal::Delete); -} - -// Returns the i-th test among all the tests. i can range from 0 to -// total_test_count() - 1. If i is not in that range, returns NULL. -const TestInfo* TestCase::GetTestInfo(int i) const { - const int index = GetElementOr(test_indices_, i, -1); - return index < 0 ? NULL : test_info_list_[index]; -} - -// Returns the i-th test among all the tests. i can range from 0 to -// total_test_count() - 1. If i is not in that range, returns NULL. -TestInfo* TestCase::GetMutableTestInfo(int i) { - const int index = GetElementOr(test_indices_, i, -1); - return index < 0 ? NULL : test_info_list_[index]; -} - -// Adds a test to this test case. Will delete the test upon -// destruction of the TestCase object. -void TestCase::AddTestInfo(TestInfo * test_info) { - test_info_list_.push_back(test_info); - test_indices_.push_back(static_cast(test_indices_.size())); -} - -// Runs every test in this TestCase. 
-void TestCase::Run() { - if (!should_run_) return; - - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - impl->set_current_test_case(this); - - TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); - - repeater->OnTestCaseStart(*this); - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &TestCase::RunSetUpTestCase, "SetUpTestCase()"); - - const internal::TimeInMillis start = internal::GetTimeInMillis(); - for (int i = 0; i < total_test_count(); i++) { - GetMutableTestInfo(i)->Run(); - } - elapsed_time_ = internal::GetTimeInMillis() - start; - - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &TestCase::RunTearDownTestCase, "TearDownTestCase()"); - - repeater->OnTestCaseEnd(*this); - impl->set_current_test_case(NULL); -} - -// Clears the results of all tests in this test case. -void TestCase::ClearResult() { - ad_hoc_test_result_.Clear(); - ForEach(test_info_list_, TestInfo::ClearTestResult); -} - -// Shuffles the tests in this test case. -void TestCase::ShuffleTests(internal::Random* random) { - Shuffle(random, &test_indices_); -} - -// Restores the test order to before the first shuffle. -void TestCase::UnshuffleTests() { - for (size_t i = 0; i < test_indices_.size(); i++) { - test_indices_[i] = static_cast(i); - } -} - -// Formats a countable noun. Depending on its quantity, either the -// singular form or the plural form is used. e.g. -// -// FormatCountableNoun(1, "formula", "formuli") returns "1 formula". -// FormatCountableNoun(5, "book", "books") returns "5 books". -static std::string FormatCountableNoun(int count, - const char * singular_form, - const char * plural_form) { - return internal::StreamableToString(count) + " " + - (count == 1 ? singular_form : plural_form); -} - -// Formats the count of tests. -static std::string FormatTestCount(int test_count) { - return FormatCountableNoun(test_count, "test", "tests"); -} - -// Formats the count of test cases. -static std::string FormatTestCaseCount(int test_case_count) { - return FormatCountableNoun(test_case_count, "test case", "test cases"); -} - -// Converts a TestPartResult::Type enum to human-friendly string -// representation. Both kNonFatalFailure and kFatalFailure are translated -// to "Failure", as the user usually doesn't care about the difference -// between the two when viewing the test result. -static const char * TestPartResultTypeToString(TestPartResult::Type type) { - switch (type) { - case TestPartResult::kSuccess: - return "Success"; - - case TestPartResult::kNonFatalFailure: - case TestPartResult::kFatalFailure: -#ifdef _MSC_VER - return "error: "; -#else - return "Failure\n"; -#endif - default: - return "Unknown result type"; - } -} - -namespace internal { - -// Prints a TestPartResult to an std::string. -static std::string PrintTestPartResultToString( - const TestPartResult& test_part_result) { - return (Message() - << internal::FormatFileLocation(test_part_result.file_name(), - test_part_result.line_number()) - << " " << TestPartResultTypeToString(test_part_result.type()) - << test_part_result.message()).GetString(); -} - -// Prints a TestPartResult. 
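TestCase::Run() above brackets the per-test loop with SetUpTestCase() and TearDownTestCase(), so those hooks run once per test case rather than once per test. A usage sketch of the hook pair, assuming linkage against gtest_main (class and member names are made up):

#include "gtest/gtest.h"

class DatabaseTest : public ::testing::Test {
 protected:
  // Runs once, before the first test of this test case (see TestCase::Run()).
  static void SetUpTestCase() { connection_count_ = 1; }
  // Runs once, after the last test of this test case.
  static void TearDownTestCase() { connection_count_ = 0; }

  static int connection_count_;
};

int DatabaseTest::connection_count_ = 0;

TEST_F(DatabaseTest, SeesTheCaseWideSetUp) {
  EXPECT_EQ(1, connection_count_);
}

TEST_F(DatabaseTest, StillSeesIt) {
  EXPECT_EQ(1, connection_count_);
}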
-static void PrintTestPartResult(const TestPartResult& test_part_result) { - const std::string& result = - PrintTestPartResultToString(test_part_result); - printf("%s\n", result.c_str()); - fflush(stdout); - // If the test program runs in Visual Studio or a debugger, the - // following statements add the test part result message to the Output - // window such that the user can double-click on it to jump to the - // corresponding source code location; otherwise they do nothing. -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - // We don't call OutputDebugString*() on Windows Mobile, as printing - // to stdout is done by OutputDebugString() there already - we don't - // want the same message printed twice. - ::OutputDebugStringA(result.c_str()); - ::OutputDebugStringA("\n"); -#endif -} - -// class PrettyUnitTestResultPrinter - -enum GTestColor { - COLOR_DEFAULT, - COLOR_RED, - COLOR_GREEN, - COLOR_YELLOW -}; - -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - -// Returns the character attribute for the given color. -WORD GetColorAttribute(GTestColor color) { - switch (color) { - case COLOR_RED: return FOREGROUND_RED; - case COLOR_GREEN: return FOREGROUND_GREEN; - case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN; - default: return 0; - } -} - -#else - -// Returns the ANSI color code for the given color. COLOR_DEFAULT is -// an invalid input. -const char* GetAnsiColorCode(GTestColor color) { - switch (color) { - case COLOR_RED: return "1"; - case COLOR_GREEN: return "2"; - case COLOR_YELLOW: return "3"; - default: return NULL; - }; -} - -#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - -// Returns true iff Google Test should use colors in the output. -bool ShouldUseColor(bool stdout_is_tty) { - const char* const gtest_color = GTEST_FLAG(color).c_str(); - - if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) { -#if GTEST_OS_WINDOWS - // On Windows the TERM variable is usually not set, but the - // console there does support colors. - return stdout_is_tty; -#else - // On non-Windows platforms, we rely on the TERM variable. - const char* const term = posix::GetEnv("TERM"); - const bool term_supports_color = - String::CStringEquals(term, "xterm") || - String::CStringEquals(term, "xterm-color") || - String::CStringEquals(term, "xterm-256color") || - String::CStringEquals(term, "screen") || - String::CStringEquals(term, "screen-256color") || - String::CStringEquals(term, "linux") || - String::CStringEquals(term, "cygwin"); - return stdout_is_tty && term_supports_color; -#endif // GTEST_OS_WINDOWS - } - - return String::CaseInsensitiveCStringEquals(gtest_color, "yes") || - String::CaseInsensitiveCStringEquals(gtest_color, "true") || - String::CaseInsensitiveCStringEquals(gtest_color, "t") || - String::CStringEquals(gtest_color, "1"); - // We take "yes", "true", "t", and "1" as meaning "yes". If the - // value is neither one of these nor "auto", we treat it as "no" to - // be conservative. -} - -// Helpers for printing colored strings to stdout. Note that on Windows, we -// cannot simply emit special characters and have the terminal change colors. -// This routine must actually emit the characters rather than return a string -// that would be colored when printed, as can be done on Linux. -void ColoredPrintf(GTestColor color, const char* fmt, ...) 
{ - va_list args; - va_start(args, fmt); - -#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS || GTEST_OS_IOS - const bool use_color = false; -#else - static const bool in_color_mode = - ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0); - const bool use_color = in_color_mode && (color != COLOR_DEFAULT); -#endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS - // The '!= 0' comparison is necessary to satisfy MSVC 7.1. - - if (!use_color) { - vprintf(fmt, args); - va_end(args); - return; - } - -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); - - // Gets the current text color. - CONSOLE_SCREEN_BUFFER_INFO buffer_info; - GetConsoleScreenBufferInfo(stdout_handle, &buffer_info); - const WORD old_color_attrs = buffer_info.wAttributes; - - // We need to flush the stream buffers into the console before each - // SetConsoleTextAttribute call lest it affect the text that is already - // printed but has not yet reached the console. - fflush(stdout); - SetConsoleTextAttribute(stdout_handle, - GetColorAttribute(color) | FOREGROUND_INTENSITY); - vprintf(fmt, args); - - fflush(stdout); - // Restores the text color. - SetConsoleTextAttribute(stdout_handle, old_color_attrs); -#else - printf("\033[0;3%sm", GetAnsiColorCode(color)); - vprintf(fmt, args); - printf("\033[m"); // Resets the terminal to default. -#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - va_end(args); -} - -// Text printed in Google Test's text output and --gunit_list_tests -// output to label the type parameter and value parameter for a test. -static const char kTypeParamLabel[] = "TypeParam"; -static const char kValueParamLabel[] = "GetParam()"; - -void PrintFullTestCommentIfPresent(const TestInfo& test_info) { - const char* const type_param = test_info.type_param(); - const char* const value_param = test_info.value_param(); - - if (type_param != NULL || value_param != NULL) { - printf(", where "); - if (type_param != NULL) { - printf("%s = %s", kTypeParamLabel, type_param); - if (value_param != NULL) - printf(" and "); - } - if (value_param != NULL) { - printf("%s = %s", kValueParamLabel, value_param); - } - } -} - -// This class implements the TestEventListener interface. -// -// Class PrettyUnitTestResultPrinter is copyable. -class PrettyUnitTestResultPrinter : public TestEventListener { - public: - PrettyUnitTestResultPrinter() {} - static void PrintTestName(const char * test_case, const char * test) { - printf("%s.%s", test_case, test); - } - - // The following methods override what's in the TestEventListener class. 
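In the non-Windows branch of ColoredPrintf() above, the text is wrapped in an ANSI SGR escape: "\033[0;3Nm" selects foreground color N (1 red, 2 green, 3 yellow, matching GetAnsiColorCode()) and "\033[m" resets the terminal. A minimal sketch of that branch only; it skips the TTY and TERM checks done by ShouldUseColor(), and the Windows console-attribute path is omitted:

#include <cstdarg>
#include <cstdio>

enum AnsiColor { kRed = 1, kGreen = 2, kYellow = 3 };

// Prints the formatted text wrapped in an ANSI color escape sequence.
// Illustrative sketch; real code should first check that stdout is a TTY.
void PrintColored(AnsiColor color, const char* fmt, ...) {
  va_list args;
  va_start(args, fmt);
  std::printf("\033[0;3%dm", static_cast<int>(color));  // set foreground color
  std::vprintf(fmt, args);
  std::printf("\033[m");                                // reset attributes
  va_end(args);
}

int main() {
  PrintColored(kGreen, "[==========] ");
  std::printf("Running %d tests.\n", 3);
  return 0;
}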
- virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {} - virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration); - virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test); - virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {} - virtual void OnTestCaseStart(const TestCase& test_case); - virtual void OnTestStart(const TestInfo& test_info); - virtual void OnTestPartResult(const TestPartResult& result); - virtual void OnTestEnd(const TestInfo& test_info); - virtual void OnTestCaseEnd(const TestCase& test_case); - virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test); - virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {} - virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); - virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {} - - private: - static void PrintFailedTests(const UnitTest& unit_test); -}; - - // Fired before each iteration of tests starts. -void PrettyUnitTestResultPrinter::OnTestIterationStart( - const UnitTest& unit_test, int iteration) { - if (GTEST_FLAG(repeat) != 1) - printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1); - - const char* const filter = GTEST_FLAG(filter).c_str(); - - // Prints the filter if it's not *. This reminds the user that some - // tests may be skipped. - if (!String::CStringEquals(filter, kUniversalFilter)) { - ColoredPrintf(COLOR_YELLOW, - "Note: %s filter = %s\n", GTEST_NAME_, filter); - } - - if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) { - const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1); - ColoredPrintf(COLOR_YELLOW, - "Note: This is test shard %d of %s.\n", - static_cast(shard_index) + 1, - internal::posix::GetEnv(kTestTotalShards)); - } - - if (GTEST_FLAG(shuffle)) { - ColoredPrintf(COLOR_YELLOW, - "Note: Randomizing tests' orders with a seed of %d .\n", - unit_test.random_seed()); - } - - ColoredPrintf(COLOR_GREEN, "[==========] "); - printf("Running %s from %s.\n", - FormatTestCount(unit_test.test_to_run_count()).c_str(), - FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str()); - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart( - const UnitTest& /*unit_test*/) { - ColoredPrintf(COLOR_GREEN, "[----------] "); - printf("Global test environment set-up.\n"); - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) { - const std::string counts = - FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); - ColoredPrintf(COLOR_GREEN, "[----------] "); - printf("%s from %s", counts.c_str(), test_case.name()); - if (test_case.type_param() == NULL) { - printf("\n"); - } else { - printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param()); - } - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { - ColoredPrintf(COLOR_GREEN, "[ RUN ] "); - PrintTestName(test_info.test_case_name(), test_info.name()); - printf("\n"); - fflush(stdout); -} - -// Called after an assertion failure. -void PrettyUnitTestResultPrinter::OnTestPartResult( - const TestPartResult& result) { - // If the test part succeeded, we don't need to do anything. - if (result.type() == TestPartResult::kSuccess) - return; - - // Print failure message from the assertion (e.g. expected this and got that). 
- PrintTestPartResult(result); - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { - if (test_info.result()->Passed()) { - ColoredPrintf(COLOR_GREEN, "[ OK ] "); - } else { - ColoredPrintf(COLOR_RED, "[ FAILED ] "); - } - PrintTestName(test_info.test_case_name(), test_info.name()); - if (test_info.result()->Failed()) - PrintFullTestCommentIfPresent(test_info); - - if (GTEST_FLAG(print_time)) { - printf(" (%s ms)\n", internal::StreamableToString( - test_info.result()->elapsed_time()).c_str()); - } else { - printf("\n"); - } - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { - if (!GTEST_FLAG(print_time)) return; - - const std::string counts = - FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); - ColoredPrintf(COLOR_GREEN, "[----------] "); - printf("%s from %s (%s ms total)\n\n", - counts.c_str(), test_case.name(), - internal::StreamableToString(test_case.elapsed_time()).c_str()); - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart( - const UnitTest& /*unit_test*/) { - ColoredPrintf(COLOR_GREEN, "[----------] "); - printf("Global test environment tear-down\n"); - fflush(stdout); -} - -// Internal helper for printing the list of failed tests. -void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) { - const int failed_test_count = unit_test.failed_test_count(); - if (failed_test_count == 0) { - return; - } - - for (int i = 0; i < unit_test.total_test_case_count(); ++i) { - const TestCase& test_case = *unit_test.GetTestCase(i); - if (!test_case.should_run() || (test_case.failed_test_count() == 0)) { - continue; - } - for (int j = 0; j < test_case.total_test_count(); ++j) { - const TestInfo& test_info = *test_case.GetTestInfo(j); - if (!test_info.should_run() || test_info.result()->Passed()) { - continue; - } - ColoredPrintf(COLOR_RED, "[ FAILED ] "); - printf("%s.%s", test_case.name(), test_info.name()); - PrintFullTestCommentIfPresent(test_info); - printf("\n"); - } - } -} - -void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, - int /*iteration*/) { - ColoredPrintf(COLOR_GREEN, "[==========] "); - printf("%s from %s ran.", - FormatTestCount(unit_test.test_to_run_count()).c_str(), - FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str()); - if (GTEST_FLAG(print_time)) { - printf(" (%s ms total)", - internal::StreamableToString(unit_test.elapsed_time()).c_str()); - } - printf("\n"); - ColoredPrintf(COLOR_GREEN, "[ PASSED ] "); - printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str()); - - int num_failures = unit_test.failed_test_count(); - if (!unit_test.Passed()) { - const int failed_test_count = unit_test.failed_test_count(); - ColoredPrintf(COLOR_RED, "[ FAILED ] "); - printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str()); - PrintFailedTests(unit_test); - printf("\n%2d FAILED %s\n", num_failures, - num_failures == 1 ? "TEST" : "TESTS"); - } - - int num_disabled = unit_test.reportable_disabled_test_count(); - if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) { - if (!num_failures) { - printf("\n"); // Add a spacer if no FAILURE banner is displayed. - } - ColoredPrintf(COLOR_YELLOW, - " YOU HAVE %d DISABLED %s\n\n", - num_disabled, - num_disabled == 1 ? "TEST" : "TESTS"); - } - // Ensure that Google Test output is printed before, e.g., heapchecker output. 
- fflush(stdout); -} - -// End PrettyUnitTestResultPrinter - -// class TestEventRepeater -// -// This class forwards events to other event listeners. -class TestEventRepeater : public TestEventListener { - public: - TestEventRepeater() : forwarding_enabled_(true) {} - virtual ~TestEventRepeater(); - void Append(TestEventListener *listener); - TestEventListener* Release(TestEventListener* listener); - - // Controls whether events will be forwarded to listeners_. Set to false - // in death test child processes. - bool forwarding_enabled() const { return forwarding_enabled_; } - void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; } - - virtual void OnTestProgramStart(const UnitTest& unit_test); - virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration); - virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test); - virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test); - virtual void OnTestCaseStart(const TestCase& test_case); - virtual void OnTestStart(const TestInfo& test_info); - virtual void OnTestPartResult(const TestPartResult& result); - virtual void OnTestEnd(const TestInfo& test_info); - virtual void OnTestCaseEnd(const TestCase& test_case); - virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test); - virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test); - virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); - virtual void OnTestProgramEnd(const UnitTest& unit_test); - - private: - // Controls whether events will be forwarded to listeners_. Set to false - // in death test child processes. - bool forwarding_enabled_; - // The list of listeners that receive events. - std::vector listeners_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater); -}; - -TestEventRepeater::~TestEventRepeater() { - ForEach(listeners_, Delete); -} - -void TestEventRepeater::Append(TestEventListener *listener) { - listeners_.push_back(listener); -} - -// TODO(vladl@google.com): Factor the search functionality into Vector::Find. -TestEventListener* TestEventRepeater::Release(TestEventListener *listener) { - for (size_t i = 0; i < listeners_.size(); ++i) { - if (listeners_[i] == listener) { - listeners_.erase(listeners_.begin() + i); - return listener; - } - } - - return NULL; -} - -// Since most methods are very similar, use macros to reduce boilerplate. -// This defines a member that forwards the call to all listeners. -#define GTEST_REPEATER_METHOD_(Name, Type) \ -void TestEventRepeater::Name(const Type& parameter) { \ - if (forwarding_enabled_) { \ - for (size_t i = 0; i < listeners_.size(); i++) { \ - listeners_[i]->Name(parameter); \ - } \ - } \ -} -// This defines a member that forwards the call to all listeners in reverse -// order. 
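The repeater defined above is the mechanism that fans test events out to every registered listener; user code normally reaches it through the public TestEventListeners API (Append/Release further below). As a minimal usage sketch, assuming only the standard gtest headers, a custom listener can be attached like this (MinimalListener is a made-up name, not part of the library):

#include "gtest/gtest.h"
#include <cstdio>

// Hypothetical listener that prints a line when each test ends.
class MinimalListener : public ::testing::EmptyTestEventListener {
 public:
  virtual void OnTestEnd(const ::testing::TestInfo& test_info) {
    std::printf("[listener] %s.%s finished\n",
                test_info.test_case_name(), test_info.name());
  }
};

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  ::testing::TestEventListeners& listeners =
      ::testing::UnitTest::GetInstance()->listeners();
  // Appended listeners receive events through the TestEventRepeater;
  // ownership passes to Google Test, which deletes them at program exit.
  listeners.Append(new MinimalListener);
  return RUN_ALL_TESTS();
}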
-#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \ -void TestEventRepeater::Name(const Type& parameter) { \ - if (forwarding_enabled_) { \ - for (int i = static_cast(listeners_.size()) - 1; i >= 0; i--) { \ - listeners_[i]->Name(parameter); \ - } \ - } \ -} - -GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest) -GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest) -GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase) -GTEST_REPEATER_METHOD_(OnTestStart, TestInfo) -GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult) -GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest) -GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest) -GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest) -GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo) -GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase) -GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest) - -#undef GTEST_REPEATER_METHOD_ -#undef GTEST_REVERSE_REPEATER_METHOD_ - -void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test, - int iteration) { - if (forwarding_enabled_) { - for (size_t i = 0; i < listeners_.size(); i++) { - listeners_[i]->OnTestIterationStart(unit_test, iteration); - } - } -} - -void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test, - int iteration) { - if (forwarding_enabled_) { - for (int i = static_cast(listeners_.size()) - 1; i >= 0; i--) { - listeners_[i]->OnTestIterationEnd(unit_test, iteration); - } - } -} - -// End TestEventRepeater - -// This class generates an XML output file. -class XmlUnitTestResultPrinter : public EmptyTestEventListener { - public: - explicit XmlUnitTestResultPrinter(const char* output_file); - - virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); - - private: - // Is c a whitespace character that is normalized to a space character - // when it appears in an XML attribute value? - static bool IsNormalizableWhitespace(char c) { - return c == 0x9 || c == 0xA || c == 0xD; - } - - // May c appear in a well-formed XML document? - static bool IsValidXmlCharacter(char c) { - return IsNormalizableWhitespace(c) || c >= 0x20; - } - - // Returns an XML-escaped copy of the input string str. If - // is_attribute is true, the text is meant to appear as an attribute - // value, and normalizable whitespace is preserved by replacing it - // with character references. - static std::string EscapeXml(const std::string& str, bool is_attribute); - - // Returns the given string with all characters invalid in XML removed. - static std::string RemoveInvalidXmlCharacters(const std::string& str); - - // Convenience wrapper around EscapeXml when str is an attribute value. - static std::string EscapeXmlAttribute(const std::string& str) { - return EscapeXml(str, true); - } - - // Convenience wrapper around EscapeXml when str is not an attribute value. - static std::string EscapeXmlText(const char* str) { - return EscapeXml(str, false); - } - - // Verifies that the given attribute belongs to the given element and - // streams the attribute as XML. - static void OutputXmlAttribute(std::ostream* stream, - const std::string& element_name, - const std::string& name, - const std::string& value); - - // Streams an XML CDATA section, escaping invalid CDATA sequences as needed. - static void OutputXmlCDataSection(::std::ostream* stream, const char* data); - - // Streams an XML representation of a TestInfo object. 
- static void OutputXmlTestInfo(::std::ostream* stream, - const char* test_case_name, - const TestInfo& test_info); - - // Prints an XML representation of a TestCase object - static void PrintXmlTestCase(::std::ostream* stream, - const TestCase& test_case); - - // Prints an XML summary of unit_test to output stream out. - static void PrintXmlUnitTest(::std::ostream* stream, - const UnitTest& unit_test); - - // Produces a string representing the test properties in a result as space - // delimited XML attributes based on the property key="value" pairs. - // When the std::string is not empty, it includes a space at the beginning, - // to delimit this attribute from prior attributes. - static std::string TestPropertiesAsXmlAttributes(const TestResult& result); - - // The output file. - const std::string output_file_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter); -}; - -// Creates a new XmlUnitTestResultPrinter. -XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file) - : output_file_(output_file) { - if (output_file_.c_str() == NULL || output_file_.empty()) { - fprintf(stderr, "XML output file may not be null\n"); - fflush(stderr); - exit(EXIT_FAILURE); - } -} - -// Called after the unit test ends. -void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, - int /*iteration*/) { - FILE* xmlout = NULL; - FilePath output_file(output_file_); - FilePath output_dir(output_file.RemoveFileName()); - - if (output_dir.CreateDirectoriesRecursively()) { - xmlout = posix::FOpen(output_file_.c_str(), "w"); - } - if (xmlout == NULL) { - // TODO(wan): report the reason of the failure. - // - // We don't do it for now as: - // - // 1. There is no urgent need for it. - // 2. It's a bit involved to make the errno variable thread-safe on - // all three operating systems (Linux, Windows, and Mac OS). - // 3. To interpret the meaning of errno in a thread-safe way, - // we need the strerror_r() function, which is not available on - // Windows. - fprintf(stderr, - "Unable to open file \"%s\"\n", - output_file_.c_str()); - fflush(stderr); - exit(EXIT_FAILURE); - } - std::stringstream stream; - PrintXmlUnitTest(&stream, unit_test); - fprintf(xmlout, "%s", StringStreamToString(&stream).c_str()); - fclose(xmlout); -} - -// Returns an XML-escaped copy of the input string str. If is_attribute -// is true, the text is meant to appear as an attribute value, and -// normalizable whitespace is preserved by replacing it with character -// references. -// -// Invalid XML characters in str, if any, are stripped from the output. -// It is expected that most, if not all, of the text processed by this -// module will consist of ordinary English text. -// If this module is ever modified to produce version 1.1 XML output, -// most invalid characters can be retained using character references. -// TODO(wan): It might be nice to have a minimally invasive, human-readable -// escaping scheme for invalid characters, rather than dropping them. 
-std::string XmlUnitTestResultPrinter::EscapeXml(
-    const std::string& str, bool is_attribute) {
-  Message m;
-
-  for (size_t i = 0; i < str.size(); ++i) {
-    const char ch = str[i];
-    switch (ch) {
-      case '<':
-        m << "&lt;";
-        break;
-      case '>':
-        m << "&gt;";
-        break;
-      case '&':
-        m << "&amp;";
-        break;
-      case '\'':
-        if (is_attribute)
-          m << "&apos;";
-        else
-          m << '\'';
-        break;
-      case '"':
-        if (is_attribute)
-          m << "&quot;";
-        else
-          m << '"';
-        break;
-      default:
-        if (IsValidXmlCharacter(ch)) {
-          if (is_attribute && IsNormalizableWhitespace(ch))
-            m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
-              << ";";
-          else
-            m << ch;
-        }
-        break;
-    }
-  }
-
-  return m.GetString();
-}
-
-// Returns the given string with all characters invalid in XML removed.
-// Currently invalid characters are dropped from the string. An
-// alternative is to replace them with certain characters such as . or ?.
-std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
-    const std::string& str) {
-  std::string output;
-  output.reserve(str.size());
-  for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
-    if (IsValidXmlCharacter(*it))
-      output.push_back(*it);
-
-  return output;
-}
-
-// The following routines generate an XML representation of a UnitTest
-// object.
-//
-// This is how Google Test concepts map to the DTD:
-//
-// <testsuites name="AllTests">        <-- corresponds to a UnitTest object
-//   <testsuite name="testcase-name">  <-- corresponds to a TestCase object
-//     <testcase name="test-name">     <-- corresponds to a TestInfo object
-//       <failure message="...">...</failure>
-//       <failure message="...">...</failure>
-//       <failure message="...">...</failure>
-//                                     <-- individual assertion failures
-//     </testcase>
-//   </testsuite>
-// </testsuites>
-
-// Formats the given time in milliseconds as seconds.
-std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
-  ::std::stringstream ss;
-  ss << ms/1000.0;
-  return ss.str();
-}
-
-// Converts the given epoch time in milliseconds to a date string in the ISO
-// 8601 format, without the timezone information.
-std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
-  // Using non-reentrant version as localtime_r is not portable.
-  time_t seconds = static_cast<time_t>(ms / 1000);
-#ifdef _MSC_VER
-# pragma warning(push)          // Saves the current warning state.
-# pragma warning(disable:4996)  // Temporarily disables warning 4996
-                                // (function or variable may be unsafe).
-  const struct tm* const time_struct = localtime(&seconds);  // NOLINT
-# pragma warning(pop)           // Restores the warning state again.
-#else
-  const struct tm* const time_struct = localtime(&seconds);  // NOLINT
-#endif
-  if (time_struct == NULL)
-    return "";  // Invalid ms value
-
-  // YYYY-MM-DDThh:mm:ss
-  return StreamableToString(time_struct->tm_year + 1900) + "-" +
-      String::FormatIntWidth2(time_struct->tm_mon + 1) + "-" +
-      String::FormatIntWidth2(time_struct->tm_mday) + "T" +
-      String::FormatIntWidth2(time_struct->tm_hour) + ":" +
-      String::FormatIntWidth2(time_struct->tm_min) + ":" +
-      String::FormatIntWidth2(time_struct->tm_sec);
-}
-
-// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
-void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
-                                                     const char* data) {
-  const char* segment = data;
-  *stream << "<![CDATA[";
-  for (;;) {
-    const char* const next_segment = strstr(segment, "]]>");
-    if (next_segment != NULL) {
-      stream->write(
-          segment, static_cast<std::streamsize>(next_segment - segment));
-      *stream << "]]>]]&gt;<![CDATA[";
-      segment = next_segment + strlen("]]>");
-    } else {
-      *stream << segment;
-      break;
-    }
-  }
-  *stream << "]]>";
-}
-
-void XmlUnitTestResultPrinter::OutputXmlAttribute(
-    std::ostream* stream,
-    const std::string& element_name,
-    const std::string& name,
-    const std::string& value) {
-  const std::vector<std::string>& allowed_names =
-      GetReservedAttributesForElement(element_name);
-
-  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
-                   allowed_names.end())
-      << "Attribute " << name << " is not allowed for element <" << element_name
-      << ">.";
-
-  *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\"";
-}
-
-// Prints an XML representation of a TestInfo object.
-// TODO(wan): There is also value in printing properties with the plain printer.
-void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
-                                                 const char* test_case_name,
-                                                 const TestInfo& test_info) {
-  const TestResult& result = *test_info.result();
-  const std::string kTestcase = "testcase";
-
-  *stream << "    <testcase";
-  OutputXmlAttribute(stream, kTestcase, "name", test_info.name());
-
-  if (test_info.value_param() != NULL) {
-    OutputXmlAttribute(stream, kTestcase, "value_param",
-                       test_info.value_param());
-  }
-  if (test_info.type_param() != NULL) {
-    OutputXmlAttribute(stream, kTestcase, "type_param", test_info.type_param());
-  }
-
-  OutputXmlAttribute(stream, kTestcase, "status",
-                     test_info.should_run() ? "run" : "notrun");
-  OutputXmlAttribute(stream, kTestcase, "time",
-                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
-  OutputXmlAttribute(stream, kTestcase, "classname", test_case_name);
-  *stream << TestPropertiesAsXmlAttributes(result);
-
-  int failures = 0;
-  for (int i = 0; i < result.total_part_count(); ++i) {
-    const TestPartResult& part = result.GetTestPartResult(i);
-    if (part.failed()) {
-      if (++failures == 1) {
-        *stream << ">\n";
-      }
-      const string location = internal::FormatCompilerIndependentFileLocation(
-          part.file_name(), part.line_number());
-      const string summary = location + "\n" + part.summary();
-      *stream << "      <failure message=\""
-              << EscapeXmlAttribute(summary.c_str())
-              << "\" type=\"\">";
-      const string detail = location + "\n" + part.message();
-      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
-      *stream << "</failure>\n";
-    }
-  }
-
-  if (failures == 0)
-    *stream << " />\n";
-  else
-    *stream << "    </testcase>\n";
-}
-
-// Prints an XML representation of a TestCase object
-void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream,
-                                                const TestCase& test_case) {
-  const std::string kTestsuite = "testsuite";
-  *stream << "  <" << kTestsuite;
-  OutputXmlAttribute(stream, kTestsuite, "name", test_case.name());
-  OutputXmlAttribute(stream, kTestsuite, "tests",
-                     StreamableToString(test_case.reportable_test_count()));
-  OutputXmlAttribute(stream, kTestsuite, "failures",
-                     StreamableToString(test_case.failed_test_count()));
-  OutputXmlAttribute(
-      stream, kTestsuite, "disabled",
-      StreamableToString(test_case.reportable_disabled_test_count()));
-  OutputXmlAttribute(stream, kTestsuite, "errors", "0");
-  OutputXmlAttribute(stream, kTestsuite, "time",
-                     FormatTimeInMillisAsSeconds(test_case.elapsed_time()));
-  *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result())
-          << ">\n";
-
-  for (int i = 0; i < test_case.total_test_count(); ++i) {
-    if (test_case.GetTestInfo(i)->is_reportable())
-      OutputXmlTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i));
-  }
-  *stream << "  </" << kTestsuite << ">\n";
-}
-
-// Prints an XML summary of unit_test to output stream out.
-void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream, - const UnitTest& unit_test) { - const std::string kTestsuites = "testsuites"; - - *stream << "\n"; - *stream << "<" << kTestsuites; - - OutputXmlAttribute(stream, kTestsuites, "tests", - StreamableToString(unit_test.reportable_test_count())); - OutputXmlAttribute(stream, kTestsuites, "failures", - StreamableToString(unit_test.failed_test_count())); - OutputXmlAttribute( - stream, kTestsuites, "disabled", - StreamableToString(unit_test.reportable_disabled_test_count())); - OutputXmlAttribute(stream, kTestsuites, "errors", "0"); - OutputXmlAttribute( - stream, kTestsuites, "timestamp", - FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp())); - OutputXmlAttribute(stream, kTestsuites, "time", - FormatTimeInMillisAsSeconds(unit_test.elapsed_time())); - - if (GTEST_FLAG(shuffle)) { - OutputXmlAttribute(stream, kTestsuites, "random_seed", - StreamableToString(unit_test.random_seed())); - } - - *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result()); - - OutputXmlAttribute(stream, kTestsuites, "name", "AllTests"); - *stream << ">\n"; - - for (int i = 0; i < unit_test.total_test_case_count(); ++i) { - if (unit_test.GetTestCase(i)->reportable_test_count() > 0) - PrintXmlTestCase(stream, *unit_test.GetTestCase(i)); - } - *stream << "\n"; -} - -// Produces a string representing the test properties in a result as space -// delimited XML attributes based on the property key="value" pairs. -std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes( - const TestResult& result) { - Message attributes; - for (int i = 0; i < result.test_property_count(); ++i) { - const TestProperty& property = result.GetTestProperty(i); - attributes << " " << property.key() << "=" - << "\"" << EscapeXmlAttribute(property.value()) << "\""; - } - return attributes.GetString(); -} - -// End XmlUnitTestResultPrinter - -#if GTEST_CAN_STREAM_RESULTS_ - -// Checks if str contains '=', '&', '%' or '\n' characters. If yes, -// replaces them by "%xx" where xx is their hexadecimal value. For -// example, replaces "=" with "%3D". This algorithm is O(strlen(str)) -// in both time and space -- important as the input str may contain an -// arbitrarily long test failure message and stack trace. -string StreamingListener::UrlEncode(const char* str) { - string result; - result.reserve(strlen(str) + 1); - for (char ch = *str; ch != '\0'; ch = *++str) { - switch (ch) { - case '%': - case '=': - case '&': - case '\n': - result.append("%" + String::FormatByte(static_cast(ch))); - break; - default: - result.push_back(ch); - break; - } - } - return result; -} - -void StreamingListener::SocketWriter::MakeConnection() { - GTEST_CHECK_(sockfd_ == -1) - << "MakeConnection() can't be called when there is already a connection."; - - addrinfo hints; - memset(&hints, 0, sizeof(hints)); - hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses. - hints.ai_socktype = SOCK_STREAM; - addrinfo* servinfo = NULL; - - // Use the getaddrinfo() to get a linked list of IP addresses for - // the given host name. - const int error_num = getaddrinfo( - host_name_.c_str(), port_num_.c_str(), &hints, &servinfo); - if (error_num != 0) { - GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: " - << gai_strerror(error_num); - } - - // Loop through all the results and connect to the first we can. 
- for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL; - cur_addr = cur_addr->ai_next) { - sockfd_ = socket( - cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol); - if (sockfd_ != -1) { - // Connect the client socket to the server socket. - if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) { - close(sockfd_); - sockfd_ = -1; - } - } - } - - freeaddrinfo(servinfo); // all done with this structure - - if (sockfd_ == -1) { - GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to " - << host_name_ << ":" << port_num_; - } -} - -// End of class Streaming Listener -#endif // GTEST_CAN_STREAM_RESULTS__ - -// Class ScopedTrace - -// Pushes the given source file location and message onto a per-thread -// trace stack maintained by Google Test. -ScopedTrace::ScopedTrace(const char* file, int line, const Message& message) - GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { - TraceInfo trace; - trace.file = file; - trace.line = line; - trace.message = message.GetString(); - - UnitTest::GetInstance()->PushGTestTrace(trace); -} - -// Pops the info pushed by the c'tor. -ScopedTrace::~ScopedTrace() - GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { - UnitTest::GetInstance()->PopGTestTrace(); -} - - -// class OsStackTraceGetter - -// Returns the current OS stack trace as an std::string. Parameters: -// -// max_depth - the maximum number of stack frames to be included -// in the trace. -// skip_count - the number of top frames to be skipped; doesn't count -// against max_depth. -// -string OsStackTraceGetter::CurrentStackTrace(int /* max_depth */, - int /* skip_count */) - GTEST_LOCK_EXCLUDED_(mutex_) { - return ""; -} - -void OsStackTraceGetter::UponLeavingGTest() - GTEST_LOCK_EXCLUDED_(mutex_) { -} - -const char* const -OsStackTraceGetter::kElidedFramesMarker = - "... " GTEST_NAME_ " internal frames ..."; - -// A helper class that creates the premature-exit file in its -// constructor and deletes the file in its destructor. -class ScopedPrematureExitFile { - public: - explicit ScopedPrematureExitFile(const char* premature_exit_filepath) - : premature_exit_filepath_(premature_exit_filepath) { - // If a path to the premature-exit file is specified... - if (premature_exit_filepath != NULL && *premature_exit_filepath != '\0') { - // create the file with a single "0" character in it. I/O - // errors are ignored as there's nothing better we can do and we - // don't want to fail the test because of this. - FILE* pfile = posix::FOpen(premature_exit_filepath, "w"); - fwrite("0", 1, 1, pfile); - fclose(pfile); - } - } - - ~ScopedPrematureExitFile() { - if (premature_exit_filepath_ != NULL && *premature_exit_filepath_ != '\0') { - remove(premature_exit_filepath_); - } - } - - private: - const char* const premature_exit_filepath_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile); -}; - -} // namespace internal - -// class TestEventListeners - -TestEventListeners::TestEventListeners() - : repeater_(new internal::TestEventRepeater()), - default_result_printer_(NULL), - default_xml_generator_(NULL) { -} - -TestEventListeners::~TestEventListeners() { delete repeater_; } - -// Returns the standard listener responsible for the default console -// output. Can be removed from the listeners list to shut down default -// console output. Note that removing this object from the listener list -// with Release transfers its ownership to the user. 
-void TestEventListeners::Append(TestEventListener* listener) { - repeater_->Append(listener); -} - -// Removes the given event listener from the list and returns it. It then -// becomes the caller's responsibility to delete the listener. Returns -// NULL if the listener is not found in the list. -TestEventListener* TestEventListeners::Release(TestEventListener* listener) { - if (listener == default_result_printer_) - default_result_printer_ = NULL; - else if (listener == default_xml_generator_) - default_xml_generator_ = NULL; - return repeater_->Release(listener); -} - -// Returns repeater that broadcasts the TestEventListener events to all -// subscribers. -TestEventListener* TestEventListeners::repeater() { return repeater_; } - -// Sets the default_result_printer attribute to the provided listener. -// The listener is also added to the listener list and previous -// default_result_printer is removed from it and deleted. The listener can -// also be NULL in which case it will not be added to the list. Does -// nothing if the previous and the current listener objects are the same. -void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) { - if (default_result_printer_ != listener) { - // It is an error to pass this method a listener that is already in the - // list. - delete Release(default_result_printer_); - default_result_printer_ = listener; - if (listener != NULL) - Append(listener); - } -} - -// Sets the default_xml_generator attribute to the provided listener. The -// listener is also added to the listener list and previous -// default_xml_generator is removed from it and deleted. The listener can -// also be NULL in which case it will not be added to the list. Does -// nothing if the previous and the current listener objects are the same. -void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) { - if (default_xml_generator_ != listener) { - // It is an error to pass this method a listener that is already in the - // list. - delete Release(default_xml_generator_); - default_xml_generator_ = listener; - if (listener != NULL) - Append(listener); - } -} - -// Controls whether events will be forwarded by the repeater to the -// listeners in the list. -bool TestEventListeners::EventForwardingEnabled() const { - return repeater_->forwarding_enabled(); -} - -void TestEventListeners::SuppressEventForwarding() { - repeater_->set_forwarding_enabled(false); -} - -// class UnitTest - -// Gets the singleton UnitTest object. The first time this method is -// called, a UnitTest object is constructed and returned. Consecutive -// calls will return the same object. -// -// We don't protect this under mutex_ as a user is not supposed to -// call this before main() starts, from which point on the return -// value will never change. -UnitTest* UnitTest::GetInstance() { - // When compiled with MSVC 7.1 in optimized mode, destroying the - // UnitTest object upon exiting the program messes up the exit code, - // causing successful tests to appear failed. We have to use a - // different implementation in this case to bypass the compiler bug. - // This implementation makes the compiler happy, at the cost of - // leaking the UnitTest object. - - // CodeGear C++Builder insists on a public destructor for the - // default implementation. Use this implementation to keep good OO - // design with private destructor. 
- -#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__) - static UnitTest* const instance = new UnitTest; - return instance; -#else - static UnitTest instance; - return &instance; -#endif // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__) -} - -// Gets the number of successful test cases. -int UnitTest::successful_test_case_count() const { - return impl()->successful_test_case_count(); -} - -// Gets the number of failed test cases. -int UnitTest::failed_test_case_count() const { - return impl()->failed_test_case_count(); -} - -// Gets the number of all test cases. -int UnitTest::total_test_case_count() const { - return impl()->total_test_case_count(); -} - -// Gets the number of all test cases that contain at least one test -// that should run. -int UnitTest::test_case_to_run_count() const { - return impl()->test_case_to_run_count(); -} - -// Gets the number of successful tests. -int UnitTest::successful_test_count() const { - return impl()->successful_test_count(); -} - -// Gets the number of failed tests. -int UnitTest::failed_test_count() const { return impl()->failed_test_count(); } - -// Gets the number of disabled tests that will be reported in the XML report. -int UnitTest::reportable_disabled_test_count() const { - return impl()->reportable_disabled_test_count(); -} - -// Gets the number of disabled tests. -int UnitTest::disabled_test_count() const { - return impl()->disabled_test_count(); -} - -// Gets the number of tests to be printed in the XML report. -int UnitTest::reportable_test_count() const { - return impl()->reportable_test_count(); -} - -// Gets the number of all tests. -int UnitTest::total_test_count() const { return impl()->total_test_count(); } - -// Gets the number of tests that should run. -int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); } - -// Gets the time of the test program start, in ms from the start of the -// UNIX epoch. -internal::TimeInMillis UnitTest::start_timestamp() const { - return impl()->start_timestamp(); -} - -// Gets the elapsed time, in milliseconds. -internal::TimeInMillis UnitTest::elapsed_time() const { - return impl()->elapsed_time(); -} - -// Returns true iff the unit test passed (i.e. all test cases passed). -bool UnitTest::Passed() const { return impl()->Passed(); } - -// Returns true iff the unit test failed (i.e. some test case failed -// or something outside of all tests failed). -bool UnitTest::Failed() const { return impl()->Failed(); } - -// Gets the i-th test case among all the test cases. i can range from 0 to -// total_test_case_count() - 1. If i is not in that range, returns NULL. -const TestCase* UnitTest::GetTestCase(int i) const { - return impl()->GetTestCase(i); -} - -// Returns the TestResult containing information on test failures and -// properties logged outside of individual test cases. -const TestResult& UnitTest::ad_hoc_test_result() const { - return *impl()->ad_hoc_test_result(); -} - -// Gets the i-th test case among all the test cases. i can range from 0 to -// total_test_case_count() - 1. If i is not in that range, returns NULL. -TestCase* UnitTest::GetMutableTestCase(int i) { - return impl()->GetMutableTestCase(i); -} - -// Returns the list of event listeners that can be used to track events -// inside Google Test. -TestEventListeners& UnitTest::listeners() { - return *impl()->listeners(); -} - -// Registers and returns a global test environment. 
When a test -// program is run, all global test environments will be set-up in the -// order they were registered. After all tests in the program have -// finished, all global test environments will be torn-down in the -// *reverse* order they were registered. -// -// The UnitTest object takes ownership of the given environment. -// -// We don't protect this under mutex_, as we only support calling it -// from the main thread. -Environment* UnitTest::AddEnvironment(Environment* env) { - if (env == NULL) { - return NULL; - } - - impl_->environments().push_back(env); - return env; -} - -// Adds a TestPartResult to the current TestResult object. All Google Test -// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call -// this to report their results. The user code should use the -// assertion macros instead of calling this directly. -void UnitTest::AddTestPartResult( - TestPartResult::Type result_type, - const char* file_name, - int line_number, - const std::string& message, - const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) { - Message msg; - msg << message; - - internal::MutexLock lock(&mutex_); - if (impl_->gtest_trace_stack().size() > 0) { - msg << "\n" << GTEST_NAME_ << " trace:"; - - for (int i = static_cast(impl_->gtest_trace_stack().size()); - i > 0; --i) { - const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1]; - msg << "\n" << internal::FormatFileLocation(trace.file, trace.line) - << " " << trace.message; - } - } - - if (os_stack_trace.c_str() != NULL && !os_stack_trace.empty()) { - msg << internal::kStackTraceMarker << os_stack_trace; - } - - const TestPartResult result = - TestPartResult(result_type, file_name, line_number, - msg.GetString().c_str()); - impl_->GetTestPartResultReporterForCurrentThread()-> - ReportTestPartResult(result); - - if (result_type != TestPartResult::kSuccess) { - // gtest_break_on_failure takes precedence over - // gtest_throw_on_failure. This allows a user to set the latter - // in the code (perhaps in order to use Google Test assertions - // with another testing framework) and specify the former on the - // command line for debugging. - if (GTEST_FLAG(break_on_failure)) { -#if GTEST_OS_WINDOWS - // Using DebugBreak on Windows allows gtest to still break into a debugger - // when a failure happens and both the --gtest_break_on_failure and - // the --gtest_catch_exceptions flags are specified. - DebugBreak(); -#else - // Dereference NULL through a volatile pointer to prevent the compiler - // from removing. We use this rather than abort() or __builtin_trap() for - // portability: Symbian doesn't implement abort() well, and some debuggers - // don't correctly trap abort(). - *static_cast(NULL) = 1; -#endif // GTEST_OS_WINDOWS - } else if (GTEST_FLAG(throw_on_failure)) { -#if GTEST_HAS_EXCEPTIONS - throw internal::GoogleTestFailureException(result); -#else - // We cannot call abort() as it generates a pop-up in debug mode - // that cannot be suppressed in VC 7.1 or below. - exit(1); -#endif - } - } -} - -// Adds a TestProperty to the current TestResult object when invoked from -// inside a test, to current TestCase's ad_hoc_test_result_ when invoked -// from SetUpTestCase or TearDownTestCase, or to the global property set -// when invoked elsewhere. If the result already contains a property with -// the same key, the value will be updated. 
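The property-recording path just described is reachable from user code through ::testing::Test::RecordProperty. A short sketch of the typical call follows; the test and key names are made up, and the comment describes where the value surfaces in the XML report generated by the printer above:

#include "gtest/gtest.h"

TEST(ReportDemo, RecordsAProperty) {
  // Becomes an attribute on this test's <testcase> element in the XML
  // report, e.g. <testcase name="RecordsAProperty" widget_count="12" .../>.
  // Recording the same key again would overwrite the value.
  ::testing::Test::RecordProperty("widget_count", 12);
  EXPECT_GT(12, 0);
}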
-void UnitTest::RecordProperty(const std::string& key, - const std::string& value) { - impl_->RecordProperty(TestProperty(key, value)); -} - -// Runs all tests in this UnitTest object and prints the result. -// Returns 0 if successful, or 1 otherwise. -// -// We don't protect this under mutex_, as we only support calling it -// from the main thread. -int UnitTest::Run() { - const bool in_death_test_child_process = - internal::GTEST_FLAG(internal_run_death_test).length() > 0; - - // Google Test implements this protocol for catching that a test - // program exits before returning control to Google Test: - // - // 1. Upon start, Google Test creates a file whose absolute path - // is specified by the environment variable - // TEST_PREMATURE_EXIT_FILE. - // 2. When Google Test has finished its work, it deletes the file. - // - // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before - // running a Google-Test-based test program and check the existence - // of the file at the end of the test execution to see if it has - // exited prematurely. - - // If we are in the child process of a death test, don't - // create/delete the premature exit file, as doing so is unnecessary - // and will confuse the parent process. Otherwise, create/delete - // the file upon entering/leaving this function. If the program - // somehow exits before this function has a chance to return, the - // premature-exit file will be left undeleted, causing a test runner - // that understands the premature-exit-file protocol to report the - // test as having failed. - const internal::ScopedPrematureExitFile premature_exit_file( - in_death_test_child_process ? - NULL : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE")); - - // Captures the value of GTEST_FLAG(catch_exceptions). This value will be - // used for the duration of the program. - impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions)); - -#if GTEST_HAS_SEH - // Either the user wants Google Test to catch exceptions thrown by the - // tests or this is executing in the context of death test child - // process. In either case the user does not want to see pop-up dialogs - // about crashes - they are expected. - if (impl()->catch_exceptions() || in_death_test_child_process) { -# if !GTEST_OS_WINDOWS_MOBILE - // SetErrorMode doesn't exist on CE. - SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT | - SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX); -# endif // !GTEST_OS_WINDOWS_MOBILE - -# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE - // Death test children can be terminated with _abort(). On Windows, - // _abort() can show a dialog with a warning message. This forces the - // abort message to go to stderr instead. - _set_error_mode(_OUT_TO_STDERR); -# endif - -# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE - // In the debug version, Visual Studio pops up a separate dialog - // offering a choice to debug the aborted program. We need to suppress - // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement - // executed. Google Test will notify the user of any unexpected - // failure via stderr. - // - // VC++ doesn't define _set_abort_behavior() prior to the version 8.0. - // Users of prior VC versions shall suffer the agony and pain of - // clicking through the countless debug dialogs. - // TODO(vladl@google.com): find a way to suppress the abort dialog() in the - // debug mode when compiled with VC 7.1 or lower. 
- if (!GTEST_FLAG(break_on_failure)) - _set_abort_behavior( - 0x0, // Clear the following flags: - _WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump. -# endif - } -#endif // GTEST_HAS_SEH - - return internal::HandleExceptionsInMethodIfSupported( - impl(), - &internal::UnitTestImpl::RunAllTests, - "auxiliary test code (environments or event listeners)") ? 0 : 1; -} - -// Returns the working directory when the first TEST() or TEST_F() was -// executed. -const char* UnitTest::original_working_dir() const { - return impl_->original_working_dir_.c_str(); -} - -// Returns the TestCase object for the test that's currently running, -// or NULL if no test is running. -const TestCase* UnitTest::current_test_case() const - GTEST_LOCK_EXCLUDED_(mutex_) { - internal::MutexLock lock(&mutex_); - return impl_->current_test_case(); -} - -// Returns the TestInfo object for the test that's currently running, -// or NULL if no test is running. -const TestInfo* UnitTest::current_test_info() const - GTEST_LOCK_EXCLUDED_(mutex_) { - internal::MutexLock lock(&mutex_); - return impl_->current_test_info(); -} - -// Returns the random seed used at the start of the current test run. -int UnitTest::random_seed() const { return impl_->random_seed(); } - -#if GTEST_HAS_PARAM_TEST -// Returns ParameterizedTestCaseRegistry object used to keep track of -// value-parameterized tests and instantiate and register them. -internal::ParameterizedTestCaseRegistry& - UnitTest::parameterized_test_registry() - GTEST_LOCK_EXCLUDED_(mutex_) { - return impl_->parameterized_test_registry(); -} -#endif // GTEST_HAS_PARAM_TEST - -// Creates an empty UnitTest. -UnitTest::UnitTest() { - impl_ = new internal::UnitTestImpl(this); -} - -// Destructor of UnitTest. -UnitTest::~UnitTest() { - delete impl_; -} - -// Pushes a trace defined by SCOPED_TRACE() on to the per-thread -// Google Test trace stack. -void UnitTest::PushGTestTrace(const internal::TraceInfo& trace) - GTEST_LOCK_EXCLUDED_(mutex_) { - internal::MutexLock lock(&mutex_); - impl_->gtest_trace_stack().push_back(trace); -} - -// Pops a trace from the per-thread Google Test trace stack. -void UnitTest::PopGTestTrace() - GTEST_LOCK_EXCLUDED_(mutex_) { - internal::MutexLock lock(&mutex_); - impl_->gtest_trace_stack().pop_back(); -} - -namespace internal { - -UnitTestImpl::UnitTestImpl(UnitTest* parent) - : parent_(parent), -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4355) // Temporarily disables warning 4355 - // (using this in initializer). - default_global_test_part_result_reporter_(this), - default_per_thread_test_part_result_reporter_(this), -# pragma warning(pop) // Restores the warning state again. -#else - default_global_test_part_result_reporter_(this), - default_per_thread_test_part_result_reporter_(this), -#endif // _MSC_VER - global_test_part_result_repoter_( - &default_global_test_part_result_reporter_), - per_thread_test_part_result_reporter_( - &default_per_thread_test_part_result_reporter_), -#if GTEST_HAS_PARAM_TEST - parameterized_test_registry_(), - parameterized_tests_registered_(false), -#endif // GTEST_HAS_PARAM_TEST - last_death_test_case_(-1), - current_test_case_(NULL), - current_test_info_(NULL), - ad_hoc_test_result_(), - os_stack_trace_getter_(NULL), - post_flag_parse_init_performed_(false), - random_seed_(0), // Will be overridden by the flag before first use. - random_(0), // Will be reseeded before first use. 
- start_timestamp_(0), - elapsed_time_(0), -#if GTEST_HAS_DEATH_TEST - death_test_factory_(new DefaultDeathTestFactory), -#endif - // Will be overridden by the flag before first use. - catch_exceptions_(false) { - listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter); -} - -UnitTestImpl::~UnitTestImpl() { - // Deletes every TestCase. - ForEach(test_cases_, internal::Delete); - - // Deletes every Environment. - ForEach(environments_, internal::Delete); - - delete os_stack_trace_getter_; -} - -// Adds a TestProperty to the current TestResult object when invoked in a -// context of a test, to current test case's ad_hoc_test_result when invoke -// from SetUpTestCase/TearDownTestCase, or to the global property set -// otherwise. If the result already contains a property with the same key, -// the value will be updated. -void UnitTestImpl::RecordProperty(const TestProperty& test_property) { - std::string xml_element; - TestResult* test_result; // TestResult appropriate for property recording. - - if (current_test_info_ != NULL) { - xml_element = "testcase"; - test_result = &(current_test_info_->result_); - } else if (current_test_case_ != NULL) { - xml_element = "testsuite"; - test_result = &(current_test_case_->ad_hoc_test_result_); - } else { - xml_element = "testsuites"; - test_result = &ad_hoc_test_result_; - } - test_result->RecordProperty(xml_element, test_property); -} - -#if GTEST_HAS_DEATH_TEST -// Disables event forwarding if the control is currently in a death test -// subprocess. Must not be called before InitGoogleTest. -void UnitTestImpl::SuppressTestEventsIfInSubprocess() { - if (internal_run_death_test_flag_.get() != NULL) - listeners()->SuppressEventForwarding(); -} -#endif // GTEST_HAS_DEATH_TEST - -// Initializes event listeners performing XML output as specified by -// UnitTestOptions. Must not be called before InitGoogleTest. -void UnitTestImpl::ConfigureXmlOutput() { - const std::string& output_format = UnitTestOptions::GetOutputFormat(); - if (output_format == "xml") { - listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter( - UnitTestOptions::GetAbsolutePathToOutputFile().c_str())); - } else if (output_format != "") { - printf("WARNING: unrecognized output format \"%s\" ignored.\n", - output_format.c_str()); - fflush(stdout); - } -} - -#if GTEST_CAN_STREAM_RESULTS_ -// Initializes event listeners for streaming test results in string form. -// Must not be called before InitGoogleTest. -void UnitTestImpl::ConfigureStreamingOutput() { - const std::string& target = GTEST_FLAG(stream_result_to); - if (!target.empty()) { - const size_t pos = target.find(':'); - if (pos != std::string::npos) { - listeners()->Append(new StreamingListener(target.substr(0, pos), - target.substr(pos+1))); - } else { - printf("WARNING: unrecognized streaming target \"%s\" ignored.\n", - target.c_str()); - fflush(stdout); - } - } -} -#endif // GTEST_CAN_STREAM_RESULTS_ - -// Performs initialization dependent upon flag values obtained in -// ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to -// ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest -// this function is also called from RunAllTests. Since this function can be -// called more than once, it has to be idempotent. -void UnitTestImpl::PostFlagParsingInit() { - // Ensures that this function does not execute more than once. 
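ConfigureXmlOutput() and ConfigureStreamingOutput() above only react to flag values, so they are normally exercised from the command line (--gtest_output=xml:path, --gtest_stream_result_to=host:port). The same flags can be set programmatically, as in this sketch; the report path is arbitrary, and the flag is set before InitGoogleTest() so that PostFlagParsingInit() sees it:

#include "gtest/gtest.h"

int main(int argc, char** argv) {
  // Equivalent to passing --gtest_output=xml:report.xml on the command line.
  // Set before InitGoogleTest() so ConfigureXmlOutput() installs the
  // XmlUnitTestResultPrinter during PostFlagParsingInit().
  ::testing::GTEST_FLAG(output) = "xml:report.xml";
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}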
- if (!post_flag_parse_init_performed_) { - post_flag_parse_init_performed_ = true; - -#if GTEST_HAS_DEATH_TEST - InitDeathTestSubprocessControlInfo(); - SuppressTestEventsIfInSubprocess(); -#endif // GTEST_HAS_DEATH_TEST - - // Registers parameterized tests. This makes parameterized tests - // available to the UnitTest reflection API without running - // RUN_ALL_TESTS. - RegisterParameterizedTests(); - - // Configures listeners for XML output. This makes it possible for users - // to shut down the default XML output before invoking RUN_ALL_TESTS. - ConfigureXmlOutput(); - -#if GTEST_CAN_STREAM_RESULTS_ - // Configures listeners for streaming test results to the specified server. - ConfigureStreamingOutput(); -#endif // GTEST_CAN_STREAM_RESULTS_ - } -} - -// A predicate that checks the name of a TestCase against a known -// value. -// -// This is used for implementation of the UnitTest class only. We put -// it in the anonymous namespace to prevent polluting the outer -// namespace. -// -// TestCaseNameIs is copyable. -class TestCaseNameIs { - public: - // Constructor. - explicit TestCaseNameIs(const std::string& name) - : name_(name) {} - - // Returns true iff the name of test_case matches name_. - bool operator()(const TestCase* test_case) const { - return test_case != NULL && strcmp(test_case->name(), name_.c_str()) == 0; - } - - private: - std::string name_; -}; - -// Finds and returns a TestCase with the given name. If one doesn't -// exist, creates one and returns it. It's the CALLER'S -// RESPONSIBILITY to ensure that this function is only called WHEN THE -// TESTS ARE NOT SHUFFLED. -// -// Arguments: -// -// test_case_name: name of the test case -// type_param: the name of the test case's type parameter, or NULL if -// this is not a typed or a type-parameterized test case. -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case -TestCase* UnitTestImpl::GetTestCase(const char* test_case_name, - const char* type_param, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc) { - // Can we find a TestCase with the given name? - const std::vector::const_iterator test_case = - std::find_if(test_cases_.begin(), test_cases_.end(), - TestCaseNameIs(test_case_name)); - - if (test_case != test_cases_.end()) - return *test_case; - - // No. Let's create one. - TestCase* const new_test_case = - new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc); - - // Is this a death test case? - if (internal::UnitTestOptions::MatchesFilter(test_case_name, - kDeathTestCaseFilter)) { - // Yes. Inserts the test case after the last death test case - // defined so far. This only works when the test cases haven't - // been shuffled. Otherwise we may end up running a death test - // after a non-death test. - ++last_death_test_case_; - test_cases_.insert(test_cases_.begin() + last_death_test_case_, - new_test_case); - } else { - // No. Appends to the end of the list. - test_cases_.push_back(new_test_case); - } - - test_case_indices_.push_back(static_cast(test_case_indices_.size())); - return new_test_case; -} - -// Helpers for setting up / tearing down the given environment. They -// are for use in the ForEach() function. -static void SetUpEnvironment(Environment* env) { env->SetUp(); } -static void TearDownEnvironment(Environment* env) { env->TearDown(); } - -// Runs all tests in this UnitTest object, prints the result, and -// returns true if all tests are successful. 
If any exception is -// thrown during a test, the test is considered to be failed, but the -// rest of the tests will still be run. -// -// When parameterized tests are enabled, it expands and registers -// parameterized tests first in RegisterParameterizedTests(). -// All other functions called from RunAllTests() may safely assume that -// parameterized tests are ready to be counted and run. -bool UnitTestImpl::RunAllTests() { - // Makes sure InitGoogleTest() was called. - if (!GTestIsInitialized()) { - printf("%s", - "\nThis test program did NOT call ::testing::InitGoogleTest " - "before calling RUN_ALL_TESTS(). Please fix it.\n"); - return false; - } - - // Do not run any test if the --help flag was specified. - if (g_help_flag) - return true; - - // Repeats the call to the post-flag parsing initialization in case the - // user didn't call InitGoogleTest. - PostFlagParsingInit(); - - // Even if sharding is not on, test runners may want to use the - // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding - // protocol. - internal::WriteToShardStatusFileIfNeeded(); - - // True iff we are in a subprocess for running a thread-safe-style - // death test. - bool in_subprocess_for_death_test = false; - -#if GTEST_HAS_DEATH_TEST - in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL); -#endif // GTEST_HAS_DEATH_TEST - - const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex, - in_subprocess_for_death_test); - - // Compares the full test names with the filter to decide which - // tests to run. - const bool has_tests_to_run = FilterTests(should_shard - ? HONOR_SHARDING_PROTOCOL - : IGNORE_SHARDING_PROTOCOL) > 0; - - // Lists the tests and exits if the --gtest_list_tests flag was specified. - if (GTEST_FLAG(list_tests)) { - // This must be called *after* FilterTests() has been called. - ListTestsMatchingFilter(); - return true; - } - - random_seed_ = GTEST_FLAG(shuffle) ? - GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0; - - // True iff at least one test has failed. - bool failed = false; - - TestEventListener* repeater = listeners()->repeater(); - - start_timestamp_ = GetTimeInMillis(); - repeater->OnTestProgramStart(*parent_); - - // How many times to repeat the tests? We don't want to repeat them - // when we are inside the subprocess of a death test. - const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat); - // Repeats forever if the repeat count is negative. - const bool forever = repeat < 0; - for (int i = 0; forever || i != repeat; i++) { - // We want to preserve failures generated by ad-hoc test - // assertions executed before RUN_ALL_TESTS(). - ClearNonAdHocTestResult(); - - const TimeInMillis start = GetTimeInMillis(); - - // Shuffles test cases and tests if requested. - if (has_tests_to_run && GTEST_FLAG(shuffle)) { - random()->Reseed(random_seed_); - // This should be done before calling OnTestIterationStart(), - // such that a test event listener can see the actual test order - // in the event. - ShuffleTests(); - } - - // Tells the unit test event listeners that the tests are about to start. - repeater->OnTestIterationStart(*parent_, i); - - // Runs each test case if there is at least one test to run. - if (has_tests_to_run) { - // Sets up all environments beforehand. - repeater->OnEnvironmentsSetUpStart(*parent_); - ForEach(environments_, SetUpEnvironment); - repeater->OnEnvironmentsSetUpEnd(*parent_); - - // Runs the tests only if there was no fatal failure during global - // set-up. 
- if (!Test::HasFatalFailure()) { - for (int test_index = 0; test_index < total_test_case_count(); - test_index++) { - GetMutableTestCase(test_index)->Run(); - } - } - - // Tears down all environments in reverse order afterwards. - repeater->OnEnvironmentsTearDownStart(*parent_); - std::for_each(environments_.rbegin(), environments_.rend(), - TearDownEnvironment); - repeater->OnEnvironmentsTearDownEnd(*parent_); - } - - elapsed_time_ = GetTimeInMillis() - start; - - // Tells the unit test event listener that the tests have just finished. - repeater->OnTestIterationEnd(*parent_, i); - - // Gets the result and clears it. - if (!Passed()) { - failed = true; - } - - // Restores the original test order after the iteration. This - // allows the user to quickly repro a failure that happens in the - // N-th iteration without repeating the first (N - 1) iterations. - // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in - // case the user somehow changes the value of the flag somewhere - // (it's always safe to unshuffle the tests). - UnshuffleTests(); - - if (GTEST_FLAG(shuffle)) { - // Picks a new random seed for each iteration. - random_seed_ = GetNextRandomSeed(random_seed_); - } - } - - repeater->OnTestProgramEnd(*parent_); - - return !failed; -} - -// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file -// if the variable is present. If a file already exists at this location, this -// function will write over it. If the variable is present, but the file cannot -// be created, prints an error and exits. -void WriteToShardStatusFileIfNeeded() { - const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile); - if (test_shard_file != NULL) { - FILE* const file = posix::FOpen(test_shard_file, "w"); - if (file == NULL) { - ColoredPrintf(COLOR_RED, - "Could not write to the test shard status file \"%s\" " - "specified by the %s environment variable.\n", - test_shard_file, kTestShardStatusFile); - fflush(stdout); - exit(EXIT_FAILURE); - } - fclose(file); - } -} - -// Checks whether sharding is enabled by examining the relevant -// environment variable values. If the variables are present, -// but inconsistent (i.e., shard_index >= total_shards), prints -// an error and exits. If in_subprocess_for_death_test, sharding is -// disabled because it must only be applied to the original test -// process. Otherwise, we could filter out death tests we intended to execute. 
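The sharding protocol described here partitions tests across processes by index once GTEST_TOTAL_SHARDS and GTEST_SHARD_INDEX are validated. As a standalone illustration (shard and test counts are made up), the selection rule implemented by ShouldRunTestOnShard() below assigns a test with id t to the shard where t % total_shards equals the shard index:

#include <cstdio>

// Mirrors the rule used by ShouldRunTestOnShard(): test t runs on shard s
// iff t % total_shards == s, so every test runs on exactly one shard.
int main() {
  const int total_shards = 3;  // e.g. GTEST_TOTAL_SHARDS=3
  for (int shard_index = 0; shard_index < total_shards; ++shard_index) {
    std::printf("shard %d runs tests:", shard_index);
    for (int test_id = 0; test_id < 10; ++test_id) {
      if (test_id % total_shards == shard_index) std::printf(" %d", test_id);
    }
    std::printf("\n");
  }
  return 0;
}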
-bool ShouldShard(const char* total_shards_env, - const char* shard_index_env, - bool in_subprocess_for_death_test) { - if (in_subprocess_for_death_test) { - return false; - } - - const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1); - const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1); - - if (total_shards == -1 && shard_index == -1) { - return false; - } else if (total_shards == -1 && shard_index != -1) { - const Message msg = Message() - << "Invalid environment variables: you have " - << kTestShardIndex << " = " << shard_index - << ", but have left " << kTestTotalShards << " unset.\n"; - ColoredPrintf(COLOR_RED, msg.GetString().c_str()); - fflush(stdout); - exit(EXIT_FAILURE); - } else if (total_shards != -1 && shard_index == -1) { - const Message msg = Message() - << "Invalid environment variables: you have " - << kTestTotalShards << " = " << total_shards - << ", but have left " << kTestShardIndex << " unset.\n"; - ColoredPrintf(COLOR_RED, msg.GetString().c_str()); - fflush(stdout); - exit(EXIT_FAILURE); - } else if (shard_index < 0 || shard_index >= total_shards) { - const Message msg = Message() - << "Invalid environment variables: we require 0 <= " - << kTestShardIndex << " < " << kTestTotalShards - << ", but you have " << kTestShardIndex << "=" << shard_index - << ", " << kTestTotalShards << "=" << total_shards << ".\n"; - ColoredPrintf(COLOR_RED, msg.GetString().c_str()); - fflush(stdout); - exit(EXIT_FAILURE); - } - - return total_shards > 1; -} - -// Parses the environment variable var as an Int32. If it is unset, -// returns default_val. If it is not an Int32, prints an error -// and aborts. -Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) { - const char* str_val = posix::GetEnv(var); - if (str_val == NULL) { - return default_val; - } - - Int32 result; - if (!ParseInt32(Message() << "The value of environment variable " << var, - str_val, &result)) { - exit(EXIT_FAILURE); - } - return result; -} - -// Given the total number of shards, the shard index, and the test id, -// returns true iff the test should be run on this shard. The test id is -// some arbitrary but unique non-negative integer assigned to each test -// method. Assumes that 0 <= shard_index < total_shards. -bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) { - return (test_id % total_shards) == shard_index; -} - -// Compares the name of each test with the user-specified filter to -// decide whether the test should be run, then records the result in -// each TestCase and TestInfo object. -// If shard_tests == true, further filters tests based on sharding -// variables in the environment - see -// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide. -// Returns the number of tests that should run. -int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { - const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ? - Int32FromEnvOrDie(kTestTotalShards, -1) : -1; - const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ? - Int32FromEnvOrDie(kTestShardIndex, -1) : -1; - - // num_runnable_tests are the number of tests that will - // run across all shards (i.e., match filter and are not disabled). - // num_selected_tests are the number of tests to be run on - // this shard. 
- int num_runnable_tests = 0; - int num_selected_tests = 0; - for (size_t i = 0; i < test_cases_.size(); i++) { - TestCase* const test_case = test_cases_[i]; - const std::string &test_case_name = test_case->name(); - test_case->set_should_run(false); - - for (size_t j = 0; j < test_case->test_info_list().size(); j++) { - TestInfo* const test_info = test_case->test_info_list()[j]; - const std::string test_name(test_info->name()); - // A test is disabled if test case name or test name matches - // kDisableTestFilter. - const bool is_disabled = - internal::UnitTestOptions::MatchesFilter(test_case_name, - kDisableTestFilter) || - internal::UnitTestOptions::MatchesFilter(test_name, - kDisableTestFilter); - test_info->is_disabled_ = is_disabled; - - const bool matches_filter = - internal::UnitTestOptions::FilterMatchesTest(test_case_name, - test_name); - test_info->matches_filter_ = matches_filter; - - const bool is_runnable = - (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) && - matches_filter; - - const bool is_selected = is_runnable && - (shard_tests == IGNORE_SHARDING_PROTOCOL || - ShouldRunTestOnShard(total_shards, shard_index, - num_runnable_tests)); - - num_runnable_tests += is_runnable; - num_selected_tests += is_selected; - - test_info->should_run_ = is_selected; - test_case->set_should_run(test_case->should_run() || is_selected); - } - } - return num_selected_tests; -} - -// Prints the given C-string on a single line by replacing all '\n' -// characters with string "\\n". If the output takes more than -// max_length characters, only prints the first max_length characters -// and "...". -static void PrintOnOneLine(const char* str, int max_length) { - if (str != NULL) { - for (int i = 0; *str != '\0'; ++str) { - if (i >= max_length) { - printf("..."); - break; - } - if (*str == '\n') { - printf("\\n"); - i += 2; - } else { - printf("%c", *str); - ++i; - } - } - } -} - -// Prints the names of the tests matching the user-specified filter flag. -void UnitTestImpl::ListTestsMatchingFilter() { - // Print at most this many characters for each type/value parameter. - const int kMaxParamLength = 250; - - for (size_t i = 0; i < test_cases_.size(); i++) { - const TestCase* const test_case = test_cases_[i]; - bool printed_test_case_name = false; - - for (size_t j = 0; j < test_case->test_info_list().size(); j++) { - const TestInfo* const test_info = - test_case->test_info_list()[j]; - if (test_info->matches_filter_) { - if (!printed_test_case_name) { - printed_test_case_name = true; - printf("%s.", test_case->name()); - if (test_case->type_param() != NULL) { - printf(" # %s = ", kTypeParamLabel); - // We print the type parameter on a single line to make - // the output easy to parse by a program. - PrintOnOneLine(test_case->type_param(), kMaxParamLength); - } - printf("\n"); - } - printf(" %s", test_info->name()); - if (test_info->value_param() != NULL) { - printf(" # %s = ", kValueParamLabel); - // We print the value parameter on a single line to make the - // output easy to parse by a program. - PrintOnOneLine(test_info->value_param(), kMaxParamLength); - } - printf("\n"); - } - } - } - fflush(stdout); -} - -// Sets the OS stack trace getter. -// -// Does nothing if the input and the current OS stack trace getter are -// the same; otherwise, deletes the old getter and makes the input the -// current getter. 
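Taken together, FilterTests() and ShouldRunTestOnShard() select a test for this shard only if it matches the filter, is runnable, and its running index lands on the shard. A minimal standalone sketch of that selection rule, assuming GTEST_TOTAL_SHARDS=3 and GTEST_SHARD_INDEX=1 (the helper name below is illustrative, not part of Google Test):

    #include <cstdio>

    // Stand-in for internal::ShouldRunTestOnShard(): a runnable test with
    // sequential id `test_id` is selected when its id falls on this shard.
    static bool RunsOnShard(int total_shards, int shard_index, int test_id) {
      return (test_id % total_shards) == shard_index;
    }

    int main() {
      for (int id = 0; id < 9; ++id) {
        if (RunsOnShard(3, 1, id))
          std::printf("runnable test #%d is selected on shard 1 of 3\n", id);
      }
      return 0;  // Selects ids 1, 4 and 7.
    }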
-void UnitTestImpl::set_os_stack_trace_getter( - OsStackTraceGetterInterface* getter) { - if (os_stack_trace_getter_ != getter) { - delete os_stack_trace_getter_; - os_stack_trace_getter_ = getter; - } -} - -// Returns the current OS stack trace getter if it is not NULL; -// otherwise, creates an OsStackTraceGetter, makes it the current -// getter, and returns it. -OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() { - if (os_stack_trace_getter_ == NULL) { - os_stack_trace_getter_ = new OsStackTraceGetter; - } - - return os_stack_trace_getter_; -} - -// Returns the TestResult for the test that's currently running, or -// the TestResult for the ad hoc test if no test is running. -TestResult* UnitTestImpl::current_test_result() { - return current_test_info_ ? - &(current_test_info_->result_) : &ad_hoc_test_result_; -} - -// Shuffles all test cases, and the tests within each test case, -// making sure that death tests are still run first. -void UnitTestImpl::ShuffleTests() { - // Shuffles the death test cases. - ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_); - - // Shuffles the non-death test cases. - ShuffleRange(random(), last_death_test_case_ + 1, - static_cast(test_cases_.size()), &test_case_indices_); - - // Shuffles the tests inside each test case. - for (size_t i = 0; i < test_cases_.size(); i++) { - test_cases_[i]->ShuffleTests(random()); - } -} - -// Restores the test cases and tests to their order before the first shuffle. -void UnitTestImpl::UnshuffleTests() { - for (size_t i = 0; i < test_cases_.size(); i++) { - // Unshuffles the tests in each test case. - test_cases_[i]->UnshuffleTests(); - // Resets the index of each test case. - test_case_indices_[i] = static_cast(i); - } -} - -// Returns the current OS stack trace as an std::string. -// -// The maximum number of stack frames to be included is specified by -// the gtest_stack_trace_depth flag. The skip_count parameter -// specifies the number of top frames to be skipped, which doesn't -// count against the number of frames to be included. -// -// For example, if Foo() calls Bar(), which in turn calls -// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in -// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. -std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, - int skip_count) { - // We pass skip_count + 1 to skip this wrapper function in addition - // to what the user really wants to skip. - return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1); -} - -// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to -// suppress unreachable code warnings. -namespace { -class ClassUniqueToAlwaysTrue {}; -} - -bool IsTrue(bool condition) { return condition; } - -bool AlwaysTrue() { -#if GTEST_HAS_EXCEPTIONS - // This condition is always false so AlwaysTrue() never actually throws, - // but it makes the compiler think that it may throw. - if (IsTrue(false)) - throw ClassUniqueToAlwaysTrue(); -#endif // GTEST_HAS_EXCEPTIONS - return true; -} - -// If *pstr starts with the given prefix, modifies *pstr to be right -// past the prefix and returns true; otherwise leaves *pstr unchanged -// and returns false. None of pstr, *pstr, and prefix can be NULL. -bool SkipPrefix(const char* prefix, const char** pstr) { - const size_t prefix_len = strlen(prefix); - if (strncmp(*pstr, prefix, prefix_len) == 0) { - *pstr += prefix_len; - return true; - } - return false; -} - -// Parses a string as a command line flag. 
The string should have -// the format "--flag=value". When def_optional is true, the "=value" -// part can be omitted. -// -// Returns the value of the flag, or NULL if the parsing failed. -const char* ParseFlagValue(const char* str, - const char* flag, - bool def_optional) { - // str and flag must not be NULL. - if (str == NULL || flag == NULL) return NULL; - - // The flag must start with "--" followed by GTEST_FLAG_PREFIX_. - const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag; - const size_t flag_len = flag_str.length(); - if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL; - - // Skips the flag name. - const char* flag_end = str + flag_len; - - // When def_optional is true, it's OK to not have a "=value" part. - if (def_optional && (flag_end[0] == '\0')) { - return flag_end; - } - - // If def_optional is true and there are more characters after the - // flag name, or if def_optional is false, there must be a '=' after - // the flag name. - if (flag_end[0] != '=') return NULL; - - // Returns the string after "=". - return flag_end + 1; -} - -// Parses a string for a bool flag, in the form of either -// "--flag=value" or "--flag". -// -// In the former case, the value is taken as true as long as it does -// not start with '0', 'f', or 'F'. -// -// In the latter case, the value is taken as true. -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -bool ParseBoolFlag(const char* str, const char* flag, bool* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, true); - - // Aborts if the parsing failed. - if (value_str == NULL) return false; - - // Converts the string value to a bool. - *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F'); - return true; -} - -// Parses a string for an Int32 flag, in the form of -// "--flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -bool ParseInt32Flag(const char* str, const char* flag, Int32* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); - - // Aborts if the parsing failed. - if (value_str == NULL) return false; - - // Sets *value to the value of the flag. - return ParseInt32(Message() << "The value of flag --" << flag, - value_str, value); -} - -// Parses a string for a string flag, in the form of -// "--flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -bool ParseStringFlag(const char* str, const char* flag, std::string* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); - - // Aborts if the parsing failed. - if (value_str == NULL) return false; - - // Sets *value to the value of the flag. - *value = value_str; - return true; -} - -// Determines whether a string has a prefix that Google Test uses for its -// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_. -// If Google Test detects that a command line flag has its prefix but is not -// recognized, it will print its help message. Flags starting with -// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test -// internal flags and do not trigger the help message. 
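As a concrete illustration of the "--flag[=value]" convention these parsers implement, here is a simplified, self-contained restatement; the helper below hard-codes the "--gtest_" prefix and is not the real ParseFlagValue():

    #include <cstring>
    #include <string>

    // Returns a pointer to the value part of "--gtest_<name>=value", the empty
    // string for a bare "--gtest_<name>" when the value is optional, or NULL
    // when arg is not that flag at all.
    static const char* FlagValue(const char* arg, const char* name,
                                 bool value_optional) {
      const std::string full = std::string("--gtest_") + name;
      if (std::strncmp(arg, full.c_str(), full.length()) != 0) return NULL;
      const char* rest = arg + full.length();
      if (value_optional && *rest == '\0') return rest;  // e.g. "--gtest_shuffle"
      return (*rest == '=') ? rest + 1 : NULL;           // e.g. "--gtest_repeat=5"
    }

    // For a bool flag the parsed value is true unless it starts with '0', 'f'
    // or 'F': "--gtest_shuffle" and "--gtest_shuffle=yes" are true, while
    // "--gtest_shuffle=0" and "--gtest_shuffle=false" are false.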
-static bool HasGoogleTestFlagPrefix(const char* str) { - return (SkipPrefix("--", &str) || - SkipPrefix("-", &str) || - SkipPrefix("/", &str)) && - !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) && - (SkipPrefix(GTEST_FLAG_PREFIX_, &str) || - SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str)); -} - -// Prints a string containing code-encoded text. The following escape -// sequences can be used in the string to control the text color: -// -// @@ prints a single '@' character. -// @R changes the color to red. -// @G changes the color to green. -// @Y changes the color to yellow. -// @D changes to the default terminal text color. -// -// TODO(wan@google.com): Write tests for this once we add stdout -// capturing to Google Test. -static void PrintColorEncoded(const char* str) { - GTestColor color = COLOR_DEFAULT; // The current color. - - // Conceptually, we split the string into segments divided by escape - // sequences. Then we print one segment at a time. At the end of - // each iteration, the str pointer advances to the beginning of the - // next segment. - for (;;) { - const char* p = strchr(str, '@'); - if (p == NULL) { - ColoredPrintf(color, "%s", str); - return; - } - - ColoredPrintf(color, "%s", std::string(str, p).c_str()); - - const char ch = p[1]; - str = p + 2; - if (ch == '@') { - ColoredPrintf(color, "@"); - } else if (ch == 'D') { - color = COLOR_DEFAULT; - } else if (ch == 'R') { - color = COLOR_RED; - } else if (ch == 'G') { - color = COLOR_GREEN; - } else if (ch == 'Y') { - color = COLOR_YELLOW; - } else { - --str; - } - } -} - -static const char kColorEncodedHelpMessage[] = -"This program contains tests written using " GTEST_NAME_ ". You can use the\n" -"following command line flags to control its behavior:\n" -"\n" -"Test Selection:\n" -" @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n" -" List the names of all tests instead of running them. The name of\n" -" TEST(Foo, Bar) is \"Foo.Bar\".\n" -" @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS" - "[@G-@YNEGATIVE_PATTERNS]@D\n" -" Run only the tests whose name matches one of the positive patterns but\n" -" none of the negative patterns. '?' matches any single character; '*'\n" -" matches any substring; ':' separates two patterns.\n" -" @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n" -" Run all disabled tests too.\n" -"\n" -"Test Execution:\n" -" @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n" -" Run the tests repeatedly; use a negative count to repeat forever.\n" -" @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n" -" Randomize tests' orders on every iteration.\n" -" @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n" -" Random number seed to use for shuffling test orders (between 1 and\n" -" 99999, or 0 to use a seed based on the current time).\n" -"\n" -"Test Output:\n" -" @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n" -" Enable/disable colored output. The default is @Gauto@D.\n" -" -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n" -" Don't print the elapsed time of each test.\n" -" @G--" GTEST_FLAG_PREFIX_ "output=xml@Y[@G:@YDIRECTORY_PATH@G" - GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n" -" Generate an XML report in the given directory or with the given file\n" -" name. 
@YFILE_PATH@D defaults to @Gtest_details.xml@D.\n" -#if GTEST_CAN_STREAM_RESULTS_ -" @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n" -" Stream test results to the given server.\n" -#endif // GTEST_CAN_STREAM_RESULTS_ -"\n" -"Assertion Behavior:\n" -#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS -" @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n" -" Set the default death test style.\n" -#endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS -" @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n" -" Turn assertion failures into debugger break-points.\n" -" @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n" -" Turn assertion failures into C++ exceptions.\n" -" @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n" -" Do not report exceptions as test failures. Instead, allow them\n" -" to crash the program or throw a pop-up (on Windows).\n" -"\n" -"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set " - "the corresponding\n" -"environment variable of a flag (all letters in upper-case). For example, to\n" -"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_ - "color=no@D or set\n" -"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n" -"\n" -"For more information, please read the " GTEST_NAME_ " documentation at\n" -"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n" -"(not one in your own code or tests), please report it to\n" -"@G<" GTEST_DEV_EMAIL_ ">@D.\n"; - -// Parses the command line for Google Test flags, without initializing -// other parts of Google Test. The type parameter CharType can be -// instantiated to either char or wchar_t. -template <typename CharType> -void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) { - for (int i = 1; i < *argc; i++) { - const std::string arg_string = StreamableToString(argv[i]); - const char* const arg = arg_string.c_str(); - - using internal::ParseBoolFlag; - using internal::ParseInt32Flag; - using internal::ParseStringFlag; - - // Do we see a Google Test flag? - if (ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag, - &GTEST_FLAG(also_run_disabled_tests)) || - ParseBoolFlag(arg, kBreakOnFailureFlag, - &GTEST_FLAG(break_on_failure)) || - ParseBoolFlag(arg, kCatchExceptionsFlag, - &GTEST_FLAG(catch_exceptions)) || - ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) || - ParseStringFlag(arg, kDeathTestStyleFlag, - &GTEST_FLAG(death_test_style)) || - ParseBoolFlag(arg, kDeathTestUseFork, - &GTEST_FLAG(death_test_use_fork)) || - ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) || - ParseStringFlag(arg, kInternalRunDeathTestFlag, - &GTEST_FLAG(internal_run_death_test)) || - ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) || - ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) || - ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) || - ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) || - ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) || - ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) || - ParseInt32Flag(arg, kStackTraceDepthFlag, - &GTEST_FLAG(stack_trace_depth)) || - ParseStringFlag(arg, kStreamResultToFlag, - &GTEST_FLAG(stream_result_to)) || - ParseBoolFlag(arg, kThrowOnFailureFlag, - &GTEST_FLAG(throw_on_failure)) - ) { - // Yes. Shift the remainder of the argv list left by one. Note - // that argv has (*argc + 1) elements, the last one always being - // NULL. The following loop moves the trailing NULL element as - // well.
- for (int j = i; j != *argc; j++) { - argv[j] = argv[j + 1]; - } - - // Decrements the argument count. - (*argc)--; - - // We also need to decrement the iterator as we just removed - // an element. - i--; - } else if (arg_string == "--help" || arg_string == "-h" || - arg_string == "-?" || arg_string == "/?" || - HasGoogleTestFlagPrefix(arg)) { - // Both help flag and unrecognized Google Test flags (excluding - // internal ones) trigger help display. - g_help_flag = true; - } - } - - if (g_help_flag) { - // We print the help here instead of in RUN_ALL_TESTS(), as the - // latter may not be called at all if the user is using Google - // Test with another testing framework. - PrintColorEncoded(kColorEncodedHelpMessage); - } -} - -// Parses the command line for Google Test flags, without initializing -// other parts of Google Test. -void ParseGoogleTestFlagsOnly(int* argc, char** argv) { - ParseGoogleTestFlagsOnlyImpl(argc, argv); -} -void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) { - ParseGoogleTestFlagsOnlyImpl(argc, argv); -} - -// The internal implementation of InitGoogleTest(). -// -// The type parameter CharType can be instantiated to either char or -// wchar_t. -template -void InitGoogleTestImpl(int* argc, CharType** argv) { - g_init_gtest_count++; - - // We don't want to run the initialization code twice. - if (g_init_gtest_count != 1) return; - - if (*argc <= 0) return; - - internal::g_executable_path = internal::StreamableToString(argv[0]); - -#if GTEST_HAS_DEATH_TEST - - g_argvs.clear(); - for (int i = 0; i != *argc; i++) { - g_argvs.push_back(StreamableToString(argv[i])); - } - -#endif // GTEST_HAS_DEATH_TEST - - ParseGoogleTestFlagsOnly(argc, argv); - GetUnitTestImpl()->PostFlagParsingInit(); -} - -} // namespace internal - -// Initializes Google Test. This must be called before calling -// RUN_ALL_TESTS(). In particular, it parses a command line for the -// flags that Google Test recognizes. Whenever a Google Test flag is -// seen, it is removed from argv, and *argc is decremented. -// -// No value is returned. Instead, the Google Test flag variables are -// updated. -// -// Calling the function for the second time has no user-visible effect. -void InitGoogleTest(int* argc, char** argv) { - internal::InitGoogleTestImpl(argc, argv); -} - -// This overloaded version can be used in Windows programs compiled in -// UNICODE mode. -void InitGoogleTest(int* argc, wchar_t** argv) { - internal::InitGoogleTestImpl(argc, argv); -} - -} // namespace testing -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev) -// -// This file implements death tests. - - -#if GTEST_HAS_DEATH_TEST - -# if GTEST_OS_MAC -# include -# endif // GTEST_OS_MAC - -# include -# include -# include - -# if GTEST_OS_LINUX -# include -# endif // GTEST_OS_LINUX - -# include - -# if GTEST_OS_WINDOWS -# include -# else -# include -# include -# endif // GTEST_OS_WINDOWS - -# if GTEST_OS_QNX -# include -# endif // GTEST_OS_QNX - -#endif // GTEST_HAS_DEATH_TEST - - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick is to -// prevent a user from accidentally including gtest-internal-inl.h in -// his code. -#define GTEST_IMPLEMENTATION_ 1 -#undef GTEST_IMPLEMENTATION_ - -namespace testing { - -// Constants. - -// The default death test style. -static const char kDefaultDeathTestStyle[] = "fast"; - -GTEST_DEFINE_string_( - death_test_style, - internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle), - "Indicates how to run a death test in a forked child process: " - "\"threadsafe\" (child process re-executes the test binary " - "from the beginning, running only the specific death test) or " - "\"fast\" (child process runs the death test immediately " - "after forking)."); - -GTEST_DEFINE_bool_( - death_test_use_fork, - internal::BoolFromGTestEnv("death_test_use_fork", false), - "Instructs to use fork()/_exit() instead of clone() in death tests. " - "Ignored and always uses fork() on POSIX systems where clone() is not " - "implemented. Useful when running under valgrind or similar tools if " - "those do not support clone(). Valgrind 3.3.1 will just fail if " - "it sees an unsupported combination of clone() flags. " - "It is not recommended to use this flag w/o valgrind though it will " - "work in 99% of the cases. Once valgrind is fixed, this flag will " - "most likely be removed."); - -namespace internal { -GTEST_DEFINE_string_( - internal_run_death_test, "", - "Indicates the file, line number, temporal index of " - "the single death test to run, and a file descriptor to " - "which a success code may be sent, all separated by " - "the '|' characters. This flag is specified if and only if the current " - "process is a sub-process launched for running a thread-safe " - "death test. FOR INTERNAL USE ONLY."); -} // namespace internal - -#if GTEST_HAS_DEATH_TEST - -namespace internal { - -# if !GTEST_OS_WINDOWS -// Valid only for fast death tests. Indicates the code is running in the -// child process of a fast style death test. -static bool g_in_fast_death_test_child = false; -# endif // !GTEST_OS_WINDOWS - -// Returns a Boolean value indicating whether the caller is currently -// executing in the context of the death test child process. Tools such as -// Valgrind heap checkers may need this to modify their behavior in death -// tests. IMPORTANT: This is an internal utility. 
Using it may break the -// implementation of death tests. User code MUST NOT use it. -bool InDeathTestChild() { -# if GTEST_OS_WINDOWS - - // On Windows, death tests are thread-safe regardless of the value of the - // death_test_style flag. - return !GTEST_FLAG(internal_run_death_test).empty(); - -# else - - if (GTEST_FLAG(death_test_style) == "threadsafe") - return !GTEST_FLAG(internal_run_death_test).empty(); - else - return g_in_fast_death_test_child; -#endif -} - -} // namespace internal - -// ExitedWithCode constructor. -ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) { -} - -// ExitedWithCode function-call operator. -bool ExitedWithCode::operator()(int exit_status) const { -# if GTEST_OS_WINDOWS - - return exit_status == exit_code_; - -# else - - return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_; - -# endif // GTEST_OS_WINDOWS -} - -# if !GTEST_OS_WINDOWS -// KilledBySignal constructor. -KilledBySignal::KilledBySignal(int signum) : signum_(signum) { -} - -// KilledBySignal function-call operator. -bool KilledBySignal::operator()(int exit_status) const { - return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_; -} -# endif // !GTEST_OS_WINDOWS - -namespace internal { - -// Utilities needed for death tests. - -// Generates a textual description of a given exit code, in the format -// specified by wait(2). -static std::string ExitSummary(int exit_code) { - Message m; - -# if GTEST_OS_WINDOWS - - m << "Exited with exit status " << exit_code; - -# else - - if (WIFEXITED(exit_code)) { - m << "Exited with exit status " << WEXITSTATUS(exit_code); - } else if (WIFSIGNALED(exit_code)) { - m << "Terminated by signal " << WTERMSIG(exit_code); - } -# ifdef WCOREDUMP - if (WCOREDUMP(exit_code)) { - m << " (core dumped)"; - } -# endif -# endif // GTEST_OS_WINDOWS - - return m.GetString(); -} - -// Returns true if exit_status describes a process that was terminated -// by a signal, or exited normally with a nonzero exit code. -bool ExitedUnsuccessfully(int exit_status) { - return !ExitedWithCode(0)(exit_status); -} - -# if !GTEST_OS_WINDOWS -// Generates a textual failure message when a death test finds more than -// one thread running, or cannot determine the number of threads, prior -// to executing the given statement. It is the responsibility of the -// caller not to pass a thread_count of 1. -static std::string DeathTestThreadWarning(size_t thread_count) { - Message msg; - msg << "Death tests use fork(), which is unsafe particularly" - << " in a threaded context. For this test, " << GTEST_NAME_ << " "; - if (thread_count == 0) - msg << "couldn't detect the number of threads."; - else - msg << "detected " << thread_count << " threads."; - return msg.GetString(); -} -# endif // !GTEST_OS_WINDOWS - -// Flag characters for reporting a death test that did not die. -static const char kDeathTestLived = 'L'; -static const char kDeathTestReturned = 'R'; -static const char kDeathTestThrew = 'T'; -static const char kDeathTestInternalError = 'I'; - -// An enumeration describing all of the possible ways that a death test can -// conclude. DIED means that the process died while executing the test -// code; LIVED means that process lived beyond the end of the test code; -// RETURNED means that the test statement attempted to execute a return -// statement, which is not allowed; THREW means that the test statement -// returned control by throwing an exception. IN_PROGRESS means the test -// has not yet concluded. 
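For reference, the ExitedWithCode and KilledBySignal predicates defined above are normally used through Google Test's public EXPECT_EXIT/ASSERT_EXIT macros. A minimal usage sketch (the test and helper names are illustrative):

    #include <csignal>
    #include <cstdlib>
    #include "gtest/gtest.h"

    static void QuitWithCode2() { std::exit(2); }

    TEST(MyDeathTest, ExitStatusIsChecked) {
      // Passes if the statement exits with status 2 and stderr matches "".
      EXPECT_EXIT(QuitWithCode2(), ::testing::ExitedWithCode(2), "");
    #if !GTEST_OS_WINDOWS
      // KilledBySignal() is only provided on POSIX systems (see above).
      EXPECT_EXIT(std::abort(), ::testing::KilledBySignal(SIGABRT), "");
    #endif
    }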
-// TODO(vladl@google.com): Unify names and possibly values for -// AbortReason, DeathTestOutcome, and flag characters above. -enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW }; - -// Routine for aborting the program which is safe to call from an -// exec-style death test child process, in which case the error -// message is propagated back to the parent process. Otherwise, the -// message is simply printed to stderr. In either case, the program -// then exits with status 1. -void DeathTestAbort(const std::string& message) { - // On a POSIX system, this function may be called from a threadsafe-style - // death test child process, which operates on a very small stack. Use - // the heap for any additional non-minuscule memory requirements. - const InternalRunDeathTestFlag* const flag = - GetUnitTestImpl()->internal_run_death_test_flag(); - if (flag != NULL) { - FILE* parent = posix::FDOpen(flag->write_fd(), "w"); - fputc(kDeathTestInternalError, parent); - fprintf(parent, "%s", message.c_str()); - fflush(parent); - _exit(1); - } else { - fprintf(stderr, "%s", message.c_str()); - fflush(stderr); - posix::Abort(); - } -} - -// A replacement for CHECK that calls DeathTestAbort if the assertion -// fails. -# define GTEST_DEATH_TEST_CHECK_(expression) \ - do { \ - if (!::testing::internal::IsTrue(expression)) { \ - DeathTestAbort( \ - ::std::string("CHECK failed: File ") + __FILE__ + ", line " \ - + ::testing::internal::StreamableToString(__LINE__) + ": " \ - + #expression); \ - } \ - } while (::testing::internal::AlwaysFalse()) - -// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for -// evaluating any system call that fulfills two conditions: it must return -// -1 on failure, and set errno to EINTR when it is interrupted and -// should be tried again. The macro expands to a loop that repeatedly -// evaluates the expression as long as it evaluates to -1 and sets -// errno to EINTR. If the expression evaluates to -1 but errno is -// something other than EINTR, DeathTestAbort is called. -# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \ - do { \ - int gtest_retval; \ - do { \ - gtest_retval = (expression); \ - } while (gtest_retval == -1 && errno == EINTR); \ - if (gtest_retval == -1) { \ - DeathTestAbort( \ - ::std::string("CHECK failed: File ") + __FILE__ + ", line " \ - + ::testing::internal::StreamableToString(__LINE__) + ": " \ - + #expression + " != -1"); \ - } \ - } while (::testing::internal::AlwaysFalse()) - -// Returns the message describing the last system error in errno. -std::string GetLastErrnoDescription() { - return errno == 0 ? "" : posix::StrError(errno); -} - -// This is called from a death test parent process to read a failure -// message from the death test child process and log it with the FATAL -// severity. On Windows, the message is read from a pipe handle. On other -// platforms, it is read from a file descriptor. -static void FailFromInternalError(int fd) { - Message error; - char buffer[256]; - int num_read; - - do { - while ((num_read = posix::Read(fd, buffer, 255)) > 0) { - buffer[num_read] = '\0'; - error << buffer; - } - } while (num_read == -1 && errno == EINTR); - - if (num_read == 0) { - GTEST_LOG_(FATAL) << error.GetString(); - } else { - const int last_error = errno; - GTEST_LOG_(FATAL) << "Error while reading death test internal: " - << GetLastErrnoDescription() << " [" << last_error << "]"; - } -} - -// Death test constructor. Increments the running death test count -// for the current test. 
-DeathTest::DeathTest() { - TestInfo* const info = GetUnitTestImpl()->current_test_info(); - if (info == NULL) { - DeathTestAbort("Cannot run a death test outside of a TEST or " - "TEST_F construct"); - } -} - -// Creates and returns a death test by dispatching to the current -// death test factory. -bool DeathTest::Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test) { - return GetUnitTestImpl()->death_test_factory()->Create( - statement, regex, file, line, test); -} - -const char* DeathTest::LastMessage() { - return last_death_test_message_.c_str(); -} - -void DeathTest::set_last_death_test_message(const std::string& message) { - last_death_test_message_ = message; -} - -std::string DeathTest::last_death_test_message_; - -// Provides cross platform implementation for some death functionality. -class DeathTestImpl : public DeathTest { - protected: - DeathTestImpl(const char* a_statement, const RE* a_regex) - : statement_(a_statement), - regex_(a_regex), - spawned_(false), - status_(-1), - outcome_(IN_PROGRESS), - read_fd_(-1), - write_fd_(-1) {} - - // read_fd_ is expected to be closed and cleared by a derived class. - ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); } - - void Abort(AbortReason reason); - virtual bool Passed(bool status_ok); - - const char* statement() const { return statement_; } - const RE* regex() const { return regex_; } - bool spawned() const { return spawned_; } - void set_spawned(bool is_spawned) { spawned_ = is_spawned; } - int status() const { return status_; } - void set_status(int a_status) { status_ = a_status; } - DeathTestOutcome outcome() const { return outcome_; } - void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; } - int read_fd() const { return read_fd_; } - void set_read_fd(int fd) { read_fd_ = fd; } - int write_fd() const { return write_fd_; } - void set_write_fd(int fd) { write_fd_ = fd; } - - // Called in the parent process only. Reads the result code of the death - // test child process via a pipe, interprets it to set the outcome_ - // member, and closes read_fd_. Outputs diagnostics and terminates in - // case of unexpected codes. - void ReadAndInterpretStatusByte(); - - private: - // The textual content of the code this object is testing. This class - // doesn't own this string and should not attempt to delete it. - const char* const statement_; - // The regular expression which test output must match. DeathTestImpl - // doesn't own this object and should not attempt to delete it. - const RE* const regex_; - // True if the death test child process has been successfully spawned. - bool spawned_; - // The exit status of the child process. - int status_; - // How the death test concluded. - DeathTestOutcome outcome_; - // Descriptor to the read end of the pipe to the child process. It is - // always -1 in the child process. The child keeps its write end of the - // pipe in write_fd_. - int read_fd_; - // Descriptor to the child's write end of the pipe to the parent process. - // It is always -1 in the parent process. The parent keeps its end of the - // pipe in read_fd_. - int write_fd_; -}; - -// Called in the parent process only. Reads the result code of the death -// test child process via a pipe, interprets it to set the outcome_ -// member, and closes read_fd_. Outputs diagnostics and terminates in -// case of unexpected codes. 
-void DeathTestImpl::ReadAndInterpretStatusByte() { - char flag; - int bytes_read; - - // The read() here blocks until data is available (signifying the - // failure of the death test) or until the pipe is closed (signifying - // its success), so it's okay to call this in the parent before - // the child process has exited. - do { - bytes_read = posix::Read(read_fd(), &flag, 1); - } while (bytes_read == -1 && errno == EINTR); - - if (bytes_read == 0) { - set_outcome(DIED); - } else if (bytes_read == 1) { - switch (flag) { - case kDeathTestReturned: - set_outcome(RETURNED); - break; - case kDeathTestThrew: - set_outcome(THREW); - break; - case kDeathTestLived: - set_outcome(LIVED); - break; - case kDeathTestInternalError: - FailFromInternalError(read_fd()); // Does not return. - break; - default: - GTEST_LOG_(FATAL) << "Death test child process reported " - << "unexpected status byte (" - << static_cast(flag) << ")"; - } - } else { - GTEST_LOG_(FATAL) << "Read from death test child process failed: " - << GetLastErrnoDescription(); - } - GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd())); - set_read_fd(-1); -} - -// Signals that the death test code which should have exited, didn't. -// Should be called only in a death test child process. -// Writes a status byte to the child's status file descriptor, then -// calls _exit(1). -void DeathTestImpl::Abort(AbortReason reason) { - // The parent process considers the death test to be a failure if - // it finds any data in our pipe. So, here we write a single flag byte - // to the pipe, then exit. - const char status_ch = - reason == TEST_DID_NOT_DIE ? kDeathTestLived : - reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned; - - GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1)); - // We are leaking the descriptor here because on some platforms (i.e., - // when built as Windows DLL), destructors of global objects will still - // run after calling _exit(). On such systems, write_fd_ will be - // indirectly closed from the destructor of UnitTestImpl, causing double - // close if it is also closed here. On debug configurations, double close - // may assert. As there are no in-process buffers to flush here, we are - // relying on the OS to close the descriptor after the process terminates - // when the destructors are not run. - _exit(1); // Exits w/o any normal exit hooks (we were supposed to crash) -} - -// Returns an indented copy of stderr output for a death test. -// This makes distinguishing death test output lines from regular log lines -// much easier. -static ::std::string FormatDeathTestOutput(const ::std::string& output) { - ::std::string ret; - for (size_t at = 0; ; ) { - const size_t line_end = output.find('\n', at); - ret += "[ DEATH ] "; - if (line_end == ::std::string::npos) { - ret += output.substr(at); - break; - } - ret += output.substr(at, line_end + 1 - at); - at = line_end + 1; - } - return ret; -} - -// Assesses the success or failure of a death test, using both private -// members which have previously been set, and one argument: -// -// Private data members: -// outcome: An enumeration describing how the death test -// concluded: DIED, LIVED, THREW, or RETURNED. The death test -// fails in the latter three cases. -// status: The exit status of the child process. On *nix, it is in the -// in the format specified by wait(2). On Windows, this is the -// value supplied to the ExitProcess() API or a numeric code -// of the exception that terminated the program. 
-// regex: A regular expression object to be applied to -// the test's captured standard error output; the death test -// fails if it does not match. -// -// Argument: -// status_ok: true if exit_status is acceptable in the context of -// this particular death test, which fails if it is false -// -// Returns true iff all of the above conditions are met. Otherwise, the -// first failing condition, in the order given above, is the one that is -// reported. Also sets the last death test message string. -bool DeathTestImpl::Passed(bool status_ok) { - if (!spawned()) - return false; - - const std::string error_message = GetCapturedStderr(); - - bool success = false; - Message buffer; - - buffer << "Death test: " << statement() << "\n"; - switch (outcome()) { - case LIVED: - buffer << " Result: failed to die.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); - break; - case THREW: - buffer << " Result: threw an exception.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); - break; - case RETURNED: - buffer << " Result: illegal return in test statement.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); - break; - case DIED: - if (status_ok) { - const bool matched = RE::PartialMatch(error_message.c_str(), *regex()); - if (matched) { - success = true; - } else { - buffer << " Result: died but not with expected error.\n" - << " Expected: " << regex()->pattern() << "\n" - << "Actual msg:\n" << FormatDeathTestOutput(error_message); - } - } else { - buffer << " Result: died but not with expected exit code:\n" - << " " << ExitSummary(status()) << "\n" - << "Actual msg:\n" << FormatDeathTestOutput(error_message); - } - break; - case IN_PROGRESS: - default: - GTEST_LOG_(FATAL) - << "DeathTest::Passed somehow called before conclusion of test"; - } - - DeathTest::set_last_death_test_message(buffer.GetString()); - return success; -} - -# if GTEST_OS_WINDOWS -// WindowsDeathTest implements death tests on Windows. Due to the -// specifics of starting new processes on Windows, death tests there are -// always threadsafe, and Google Test considers the -// --gtest_death_test_style=fast setting to be equivalent to -// --gtest_death_test_style=threadsafe there. -// -// A few implementation notes: Like the Linux version, the Windows -// implementation uses pipes for child-to-parent communication. But due to -// the specifics of pipes on Windows, some extra steps are required: -// -// 1. The parent creates a communication pipe and stores handles to both -// ends of it. -// 2. The parent starts the child and provides it with the information -// necessary to acquire the handle to the write end of the pipe. -// 3. The child acquires the write end of the pipe and signals the parent -// using a Windows event. -// 4. Now the parent can release the write end of the pipe on its side. If -// this is done before step 3, the object's reference count goes down to -// 0 and it is destroyed, preventing the child from acquiring it. The -// parent now has to release it, or read operations on the read end of -// the pipe will not return when the child terminates. -// 5. The parent reads child's output through the pipe (outcome code and -// any possible error messages) from the pipe, and its stderr and then -// determines whether to fail the test. -// -// Note: to distinguish Win32 API calls from the local method and function -// calls, the former are explicitly resolved in the global namespace. 
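From the user's side, the distinction described above is controlled by --gtest_death_test_style on the command line or by the corresponding flag variable in code; a minimal sketch of the latter:

    #include "gtest/gtest.h"

    int main(int argc, char** argv) {
      ::testing::InitGoogleTest(&argc, argv);
      // "threadsafe" re-executes the test binary for each death test;
      // "fast" (the default) forks and runs the statement immediately.
      // On Windows both settings behave like "threadsafe", as noted above.
      ::testing::FLAGS_gtest_death_test_style = "threadsafe";
      return RUN_ALL_TESTS();
    }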
-// -class WindowsDeathTest : public DeathTestImpl { - public: - WindowsDeathTest(const char* a_statement, - const RE* a_regex, - const char* file, - int line) - : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {} - - // All of these virtual functions are inherited from DeathTest. - virtual int Wait(); - virtual TestRole AssumeRole(); - - private: - // The name of the file in which the death test is located. - const char* const file_; - // The line number on which the death test is located. - const int line_; - // Handle to the write end of the pipe to the child process. - AutoHandle write_handle_; - // Child process handle. - AutoHandle child_handle_; - // Event the child process uses to signal the parent that it has - // acquired the handle to the write end of the pipe. After seeing this - // event the parent can release its own handles to make sure its - // ReadFile() calls return when the child terminates. - AutoHandle event_handle_; -}; - -// Waits for the child in a death test to exit, returning its exit -// status, or 0 if no child process exists. As a side effect, sets the -// outcome data member. -int WindowsDeathTest::Wait() { - if (!spawned()) - return 0; - - // Wait until the child either signals that it has acquired the write end - // of the pipe or it dies. - const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() }; - switch (::WaitForMultipleObjects(2, - wait_handles, - FALSE, // Waits for any of the handles. - INFINITE)) { - case WAIT_OBJECT_0: - case WAIT_OBJECT_0 + 1: - break; - default: - GTEST_DEATH_TEST_CHECK_(false); // Should not get here. - } - - // The child has acquired the write end of the pipe or exited. - // We release the handle on our side and continue. - write_handle_.Reset(); - event_handle_.Reset(); - - ReadAndInterpretStatusByte(); - - // Waits for the child process to exit if it haven't already. This - // returns immediately if the child has already exited, regardless of - // whether previous calls to WaitForMultipleObjects synchronized on this - // handle or not. - GTEST_DEATH_TEST_CHECK_( - WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(), - INFINITE)); - DWORD status_code; - GTEST_DEATH_TEST_CHECK_( - ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE); - child_handle_.Reset(); - set_status(static_cast(status_code)); - return status(); -} - -// The AssumeRole process for a Windows death test. It creates a child -// process with the same executable as the current process to run the -// death test. The child process is given the --gtest_filter and -// --gtest_internal_run_death_test flags such that it knows to run the -// current death test only. -DeathTest::TestRole WindowsDeathTest::AssumeRole() { - const UnitTestImpl* const impl = GetUnitTestImpl(); - const InternalRunDeathTestFlag* const flag = - impl->internal_run_death_test_flag(); - const TestInfo* const info = impl->current_test_info(); - const int death_test_index = info->result()->death_test_count(); - - if (flag != NULL) { - // ParseInternalRunDeathTestFlag() has performed all the necessary - // processing. - set_write_fd(flag->write_fd()); - return EXECUTE_TEST; - } - - // WindowsDeathTest uses an anonymous pipe to communicate results of - // a death test. - SECURITY_ATTRIBUTES handles_are_inheritable = { - sizeof(SECURITY_ATTRIBUTES), NULL, TRUE }; - HANDLE read_handle, write_handle; - GTEST_DEATH_TEST_CHECK_( - ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable, - 0) // Default buffer size. 
- != FALSE); - set_read_fd(::_open_osfhandle(reinterpret_cast(read_handle), - O_RDONLY)); - write_handle_.Reset(write_handle); - event_handle_.Reset(::CreateEvent( - &handles_are_inheritable, - TRUE, // The event will automatically reset to non-signaled state. - FALSE, // The initial state is non-signalled. - NULL)); // The even is unnamed. - GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL); - const std::string filter_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" + - info->test_case_name() + "." + info->name(); - const std::string internal_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + - "=" + file_ + "|" + StreamableToString(line_) + "|" + - StreamableToString(death_test_index) + "|" + - StreamableToString(static_cast(::GetCurrentProcessId())) + - // size_t has the same width as pointers on both 32-bit and 64-bit - // Windows platforms. - // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx. - "|" + StreamableToString(reinterpret_cast(write_handle)) + - "|" + StreamableToString(reinterpret_cast(event_handle_.Get())); - - char executable_path[_MAX_PATH + 1]; // NOLINT - GTEST_DEATH_TEST_CHECK_( - _MAX_PATH + 1 != ::GetModuleFileNameA(NULL, - executable_path, - _MAX_PATH)); - - std::string command_line = - std::string(::GetCommandLineA()) + " " + filter_flag + " \"" + - internal_flag + "\""; - - DeathTest::set_last_death_test_message(""); - - CaptureStderr(); - // Flush the log buffers since the log streams are shared with the child. - FlushInfoLog(); - - // The child process will share the standard handles with the parent. - STARTUPINFOA startup_info; - memset(&startup_info, 0, sizeof(STARTUPINFO)); - startup_info.dwFlags = STARTF_USESTDHANDLES; - startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE); - startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE); - startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE); - - PROCESS_INFORMATION process_info; - GTEST_DEATH_TEST_CHECK_(::CreateProcessA( - executable_path, - const_cast(command_line.c_str()), - NULL, // Retuned process handle is not inheritable. - NULL, // Retuned thread handle is not inheritable. - TRUE, // Child inherits all inheritable handles (for write_handle_). - 0x0, // Default creation flags. - NULL, // Inherit the parent's environment. - UnitTest::GetInstance()->original_working_dir(), - &startup_info, - &process_info) != FALSE); - child_handle_.Reset(process_info.hProcess); - ::CloseHandle(process_info.hThread); - set_spawned(true); - return OVERSEE_TEST; -} -# else // We are not on Windows. - -// ForkingDeathTest provides implementations for most of the abstract -// methods of the DeathTest interface. Only the AssumeRole method is -// left undefined. -class ForkingDeathTest : public DeathTestImpl { - public: - ForkingDeathTest(const char* statement, const RE* regex); - - // All of these virtual functions are inherited from DeathTest. - virtual int Wait(); - - protected: - void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; } - - private: - // PID of child process during death test; 0 in the child process itself. - pid_t child_pid_; -}; - -// Constructs a ForkingDeathTest. -ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex) - : DeathTestImpl(a_statement, a_regex), - child_pid_(-1) {} - -// Waits for the child in a death test to exit, returning its exit -// status, or 0 if no child process exists. As a side effect, sets the -// outcome data member. 
-int ForkingDeathTest::Wait() { - if (!spawned()) - return 0; - - ReadAndInterpretStatusByte(); - - int status_value; - GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0)); - set_status(status_value); - return status_value; -} - -// A concrete death test class that forks, then immediately runs the test -// in the child process. -class NoExecDeathTest : public ForkingDeathTest { - public: - NoExecDeathTest(const char* a_statement, const RE* a_regex) : - ForkingDeathTest(a_statement, a_regex) { } - virtual TestRole AssumeRole(); -}; - -// The AssumeRole process for a fork-and-run death test. It implements a -// straightforward fork, with a simple pipe to transmit the status byte. -DeathTest::TestRole NoExecDeathTest::AssumeRole() { - const size_t thread_count = GetThreadCount(); - if (thread_count != 1) { - GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count); - } - - int pipe_fd[2]; - GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); - - DeathTest::set_last_death_test_message(""); - CaptureStderr(); - // When we fork the process below, the log file buffers are copied, but the - // file descriptors are shared. We flush all log files here so that closing - // the file descriptors in the child process doesn't throw off the - // synchronization between descriptors and buffers in the parent process. - // This is as close to the fork as possible to avoid a race condition in case - // there are multiple threads running before the death test, and another - // thread writes to the log file. - FlushInfoLog(); - - const pid_t child_pid = fork(); - GTEST_DEATH_TEST_CHECK_(child_pid != -1); - set_child_pid(child_pid); - if (child_pid == 0) { - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0])); - set_write_fd(pipe_fd[1]); - // Redirects all logging to stderr in the child process to prevent - // concurrent writes to the log files. We capture stderr in the parent - // process and append the child process' output to a log. - LogToStderr(); - // Event forwarding to the listeners of event listener API mush be shut - // down in death test subprocesses. - GetUnitTestImpl()->listeners()->SuppressEventForwarding(); - g_in_fast_death_test_child = true; - return EXECUTE_TEST; - } else { - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); - set_read_fd(pipe_fd[0]); - set_spawned(true); - return OVERSEE_TEST; - } -} - -// A concrete death test class that forks and re-executes the main -// program from the beginning, with command-line flags set that cause -// only this specific death test to be run. -class ExecDeathTest : public ForkingDeathTest { - public: - ExecDeathTest(const char* a_statement, const RE* a_regex, - const char* file, int line) : - ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { } - virtual TestRole AssumeRole(); - private: - static ::std::vector - GetArgvsForDeathTestChildProcess() { - ::std::vector args = GetInjectableArgvs(); - return args; - } - // The name of the file in which the death test is located. - const char* const file_; - // The line number on which the death test is located. - const int line_; -}; - -// Utility class for accumulating command-line arguments. 
-class Arguments { - public: - Arguments() { - args_.push_back(NULL); - } - - ~Arguments() { - for (std::vector::iterator i = args_.begin(); i != args_.end(); - ++i) { - free(*i); - } - } - void AddArgument(const char* argument) { - args_.insert(args_.end() - 1, posix::StrDup(argument)); - } - - template - void AddArguments(const ::std::vector& arguments) { - for (typename ::std::vector::const_iterator i = arguments.begin(); - i != arguments.end(); - ++i) { - args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); - } - } - char* const* Argv() { - return &args_[0]; - } - - private: - std::vector args_; -}; - -// A struct that encompasses the arguments to the child process of a -// threadsafe-style death test process. -struct ExecDeathTestArgs { - char* const* argv; // Command-line arguments for the child's call to exec - int close_fd; // File descriptor to close; the read end of a pipe -}; - -# if GTEST_OS_MAC -inline char** GetEnviron() { - // When Google Test is built as a framework on MacOS X, the environ variable - // is unavailable. Apple's documentation (man environ) recommends using - // _NSGetEnviron() instead. - return *_NSGetEnviron(); -} -# else -// Some POSIX platforms expect you to declare environ. extern "C" makes -// it reside in the global namespace. -extern "C" char** environ; -inline char** GetEnviron() { return environ; } -# endif // GTEST_OS_MAC - -# if !GTEST_OS_QNX -// The main function for a threadsafe-style death test child process. -// This function is called in a clone()-ed process and thus must avoid -// any potentially unsafe operations like malloc or libc functions. -static int ExecDeathTestChildMain(void* child_arg) { - ExecDeathTestArgs* const args = static_cast(child_arg); - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd)); - - // We need to execute the test program in the same environment where - // it was originally invoked. Therefore we change to the original - // working directory first. - const char* const original_dir = - UnitTest::GetInstance()->original_working_dir(); - // We can safely call chdir() as it's a direct system call. - if (chdir(original_dir) != 0) { - DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " + - GetLastErrnoDescription()); - return EXIT_FAILURE; - } - - // We can safely call execve() as it's a direct system call. We - // cannot use execvp() as it's a libc function and thus potentially - // unsafe. Since execve() doesn't search the PATH, the user must - // invoke the test program via a valid path that contains at least - // one path separator. - execve(args->argv[0], args->argv, GetEnviron()); - DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " + - original_dir + " failed: " + - GetLastErrnoDescription()); - return EXIT_FAILURE; -} -# endif // !GTEST_OS_QNX - -// Two utility routines that together determine the direction the stack -// grows. -// This could be accomplished more elegantly by a single recursive -// function, but we want to guard against the unlikely possibility of -// a smart compiler optimizing the recursion away. -// -// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining -// StackLowerThanAddress into StackGrowsDown, which then doesn't give -// correct answer. 
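A hypothetical use of the Arguments helper above, showing the NULL-terminated argv that execve() in ExecDeathTestChildMain() expects; the path and flag values are made-up placeholders:

    Arguments args;
    args.AddArgument("/path/to/my_test");
    args.AddArgument("--gtest_filter=MyDeathTest.ExitStatusIsChecked");
    args.AddArgument("--gtest_internal_run_death_test=foo_test.cc|42|0|5");
    char* const* child_argv = args.Argv();  // child_argv[3] is the trailing NULL.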
-void StackLowerThanAddress(const void* ptr, bool* result) GTEST_NO_INLINE_; -void StackLowerThanAddress(const void* ptr, bool* result) { - int dummy; - *result = (&dummy < ptr); -} - -bool StackGrowsDown() { - int dummy; - bool result; - StackLowerThanAddress(&dummy, &result); - return result; -} - -// Spawns a child process with the same executable as the current process in -// a thread-safe manner and instructs it to run the death test. The -// implementation uses fork(2) + exec. On systems where clone(2) is -// available, it is used instead, being slightly more thread-safe. On QNX, -// fork supports only single-threaded environments, so this function uses -// spawn(2) there instead. The function dies with an error message if -// anything goes wrong. -static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { - ExecDeathTestArgs args = { argv, close_fd }; - pid_t child_pid = -1; - -# if GTEST_OS_QNX - // Obtains the current directory and sets it to be closed in the child - // process. - const int cwd_fd = open(".", O_RDONLY); - GTEST_DEATH_TEST_CHECK_(cwd_fd != -1); - GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC)); - // We need to execute the test program in the same environment where - // it was originally invoked. Therefore we change to the original - // working directory first. - const char* const original_dir = - UnitTest::GetInstance()->original_working_dir(); - // We can safely call chdir() as it's a direct system call. - if (chdir(original_dir) != 0) { - DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " + - GetLastErrnoDescription()); - return EXIT_FAILURE; - } - - int fd_flags; - // Set close_fd to be closed after spawn. - GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD)); - GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD, - fd_flags | FD_CLOEXEC)); - struct inheritance inherit = {0}; - // spawn is a system call. - child_pid = spawn(args.argv[0], 0, NULL, &inherit, args.argv, GetEnviron()); - // Restores the current working directory. - GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1); - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd)); - -# else // GTEST_OS_QNX -# if GTEST_OS_LINUX - // When a SIGPROF signal is received while fork() or clone() are executing, - // the process may hang. To avoid this, we ignore SIGPROF here and re-enable - // it after the call to fork()/clone() is complete. - struct sigaction saved_sigprof_action; - struct sigaction ignore_sigprof_action; - memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action)); - sigemptyset(&ignore_sigprof_action.sa_mask); - ignore_sigprof_action.sa_handler = SIG_IGN; - GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction( - SIGPROF, &ignore_sigprof_action, &saved_sigprof_action)); -# endif // GTEST_OS_LINUX - -# if GTEST_HAS_CLONE - const bool use_fork = GTEST_FLAG(death_test_use_fork); - - if (!use_fork) { - static const bool stack_grows_down = StackGrowsDown(); - const size_t stack_size = getpagesize(); - // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead. - void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE, - MAP_ANON | MAP_PRIVATE, -1, 0); - GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED); - - // Maximum stack alignment in bytes: For a downward-growing stack, this - // amount is subtracted from size of the stack space to get an address - // that is within the stack space and is aligned on all systems we care - // about. As far as I know there is no ABI with stack alignment greater - // than 64. 
We assume stack and stack_size already have alignment of - // kMaxStackAlignment. - const size_t kMaxStackAlignment = 64; - void* const stack_top = - static_cast(stack) + - (stack_grows_down ? stack_size - kMaxStackAlignment : 0); - GTEST_DEATH_TEST_CHECK_(stack_size > kMaxStackAlignment && - reinterpret_cast(stack_top) % kMaxStackAlignment == 0); - - child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args); - - GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1); - } -# else - const bool use_fork = true; -# endif // GTEST_HAS_CLONE - - if (use_fork && (child_pid = fork()) == 0) { - ExecDeathTestChildMain(&args); - _exit(0); - } -# endif // GTEST_OS_QNX -# if GTEST_OS_LINUX - GTEST_DEATH_TEST_CHECK_SYSCALL_( - sigaction(SIGPROF, &saved_sigprof_action, NULL)); -# endif // GTEST_OS_LINUX - - GTEST_DEATH_TEST_CHECK_(child_pid != -1); - return child_pid; -} - -// The AssumeRole process for a fork-and-exec death test. It re-executes the -// main program from the beginning, setting the --gtest_filter -// and --gtest_internal_run_death_test flags to cause only the current -// death test to be re-run. -DeathTest::TestRole ExecDeathTest::AssumeRole() { - const UnitTestImpl* const impl = GetUnitTestImpl(); - const InternalRunDeathTestFlag* const flag = - impl->internal_run_death_test_flag(); - const TestInfo* const info = impl->current_test_info(); - const int death_test_index = info->result()->death_test_count(); - - if (flag != NULL) { - set_write_fd(flag->write_fd()); - return EXECUTE_TEST; - } - - int pipe_fd[2]; - GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); - // Clear the close-on-exec flag on the write end of the pipe, lest - // it be closed when the child process does an exec: - GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1); - - const std::string filter_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" - + info->test_case_name() + "." + info->name(); - const std::string internal_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" - + file_ + "|" + StreamableToString(line_) + "|" - + StreamableToString(death_test_index) + "|" - + StreamableToString(pipe_fd[1]); - Arguments args; - args.AddArguments(GetArgvsForDeathTestChildProcess()); - args.AddArgument(filter_flag.c_str()); - args.AddArgument(internal_flag.c_str()); - - DeathTest::set_last_death_test_message(""); - - CaptureStderr(); - // See the comment in NoExecDeathTest::AssumeRole for why the next line - // is necessary. - FlushInfoLog(); - - const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]); - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); - set_child_pid(child_pid); - set_read_fd(pipe_fd[0]); - set_spawned(true); - return OVERSEE_TEST; -} - -# endif // !GTEST_OS_WINDOWS - -// Creates a concrete DeathTest-derived class that depends on the -// --gtest_death_test_style flag, and sets the pointer pointed to -// by the "test" argument to its address. If the test should be -// skipped, sets that pointer to NULL. Returns true, unless the -// flag is set to an invalid value. 
-bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex, - const char* file, int line, - DeathTest** test) { - UnitTestImpl* const impl = GetUnitTestImpl(); - const InternalRunDeathTestFlag* const flag = - impl->internal_run_death_test_flag(); - const int death_test_index = impl->current_test_info() - ->increment_death_test_count(); - - if (flag != NULL) { - if (death_test_index > flag->index()) { - DeathTest::set_last_death_test_message( - "Death test count (" + StreamableToString(death_test_index) - + ") somehow exceeded expected maximum (" - + StreamableToString(flag->index()) + ")"); - return false; - } - - if (!(flag->file() == file && flag->line() == line && - flag->index() == death_test_index)) { - *test = NULL; - return true; - } - } - -# if GTEST_OS_WINDOWS - - if (GTEST_FLAG(death_test_style) == "threadsafe" || - GTEST_FLAG(death_test_style) == "fast") { - *test = new WindowsDeathTest(statement, regex, file, line); - } - -# else - - if (GTEST_FLAG(death_test_style) == "threadsafe") { - *test = new ExecDeathTest(statement, regex, file, line); - } else if (GTEST_FLAG(death_test_style) == "fast") { - *test = new NoExecDeathTest(statement, regex); - } - -# endif // GTEST_OS_WINDOWS - - else { // NOLINT - this is more readable than unbalanced brackets inside #if. - DeathTest::set_last_death_test_message( - "Unknown death test style \"" + GTEST_FLAG(death_test_style) - + "\" encountered"); - return false; - } - - return true; -} - -// Splits a given string on a given delimiter, populating a given -// vector with the fields. GTEST_HAS_DEATH_TEST implies that we have -// ::std::string, so we can use it here. -static void SplitString(const ::std::string& str, char delimiter, - ::std::vector< ::std::string>* dest) { - ::std::vector< ::std::string> parsed; - ::std::string::size_type pos = 0; - while (::testing::internal::AlwaysTrue()) { - const ::std::string::size_type colon = str.find(delimiter, pos); - if (colon == ::std::string::npos) { - parsed.push_back(str.substr(pos)); - break; - } else { - parsed.push_back(str.substr(pos, colon - pos)); - pos = colon + 1; - } - } - dest->swap(parsed); -} - -# if GTEST_OS_WINDOWS -// Recreates the pipe and event handles from the provided parameters, -// signals the event, and returns a file descriptor wrapped around the pipe -// handle. This function is called in the child process only. -int GetStatusFileDescriptor(unsigned int parent_process_id, - size_t write_handle_as_size_t, - size_t event_handle_as_size_t) { - AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE, - FALSE, // Non-inheritable. - parent_process_id)); - if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) { - DeathTestAbort("Unable to open parent process " + - StreamableToString(parent_process_id)); - } - - // TODO(vladl@google.com): Replace the following check with a - // compile-time assertion when available. - GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t)); - - const HANDLE write_handle = - reinterpret_cast(write_handle_as_size_t); - HANDLE dup_write_handle; - - // The newly initialized handle is accessible only in in the parent - // process. To obtain one accessible within the child, we need to use - // DuplicateHandle. - if (!::DuplicateHandle(parent_process_handle.Get(), write_handle, - ::GetCurrentProcess(), &dup_write_handle, - 0x0, // Requested privileges ignored since - // DUPLICATE_SAME_ACCESS is used. - FALSE, // Request non-inheritable handler. 
- DUPLICATE_SAME_ACCESS)) { - DeathTestAbort("Unable to duplicate the pipe handle " + - StreamableToString(write_handle_as_size_t) + - " from the parent process " + - StreamableToString(parent_process_id)); - } - - const HANDLE event_handle = reinterpret_cast(event_handle_as_size_t); - HANDLE dup_event_handle; - - if (!::DuplicateHandle(parent_process_handle.Get(), event_handle, - ::GetCurrentProcess(), &dup_event_handle, - 0x0, - FALSE, - DUPLICATE_SAME_ACCESS)) { - DeathTestAbort("Unable to duplicate the event handle " + - StreamableToString(event_handle_as_size_t) + - " from the parent process " + - StreamableToString(parent_process_id)); - } - - const int write_fd = - ::_open_osfhandle(reinterpret_cast(dup_write_handle), O_APPEND); - if (write_fd == -1) { - DeathTestAbort("Unable to convert pipe handle " + - StreamableToString(write_handle_as_size_t) + - " to a file descriptor"); - } - - // Signals the parent that the write end of the pipe has been acquired - // so the parent can release its own write end. - ::SetEvent(dup_event_handle); - - return write_fd; -} -# endif // GTEST_OS_WINDOWS - -// Returns a newly created InternalRunDeathTestFlag object with fields -// initialized from the GTEST_FLAG(internal_run_death_test) flag if -// the flag is specified; otherwise returns NULL. -InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() { - if (GTEST_FLAG(internal_run_death_test) == "") return NULL; - - // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we - // can use it here. - int line = -1; - int index = -1; - ::std::vector< ::std::string> fields; - SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields); - int write_fd = -1; - -# if GTEST_OS_WINDOWS - - unsigned int parent_process_id = 0; - size_t write_handle_as_size_t = 0; - size_t event_handle_as_size_t = 0; - - if (fields.size() != 6 - || !ParseNaturalNumber(fields[1], &line) - || !ParseNaturalNumber(fields[2], &index) - || !ParseNaturalNumber(fields[3], &parent_process_id) - || !ParseNaturalNumber(fields[4], &write_handle_as_size_t) - || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) { - DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + - GTEST_FLAG(internal_run_death_test)); - } - write_fd = GetStatusFileDescriptor(parent_process_id, - write_handle_as_size_t, - event_handle_as_size_t); -# else - - if (fields.size() != 4 - || !ParseNaturalNumber(fields[1], &line) - || !ParseNaturalNumber(fields[2], &index) - || !ParseNaturalNumber(fields[3], &write_fd)) { - DeathTestAbort("Bad --gtest_internal_run_death_test flag: " - + GTEST_FLAG(internal_run_death_test)); - } - -# endif // GTEST_OS_WINDOWS - - return new InternalRunDeathTestFlag(fields[0], line, index, write_fd); -} - -} // namespace internal - -#endif // GTEST_HAS_DEATH_TEST - -} // namespace testing -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. 
nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: keith.ray@gmail.com (Keith Ray) - - -#include - -#if GTEST_OS_WINDOWS_MOBILE -# include -#elif GTEST_OS_WINDOWS -# include -# include -#elif GTEST_OS_SYMBIAN -// Symbian OpenC has PATH_MAX in sys/syslimits.h -# include -#else -# include -# include // Some Linux distributions define PATH_MAX here. -#endif // GTEST_OS_WINDOWS_MOBILE - -#if GTEST_OS_WINDOWS -# define GTEST_PATH_MAX_ _MAX_PATH -#elif defined(PATH_MAX) -# define GTEST_PATH_MAX_ PATH_MAX -#elif defined(_XOPEN_PATH_MAX) -# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX -#else -# define GTEST_PATH_MAX_ _POSIX_PATH_MAX -#endif // GTEST_OS_WINDOWS - - -namespace testing { -namespace internal { - -#if GTEST_OS_WINDOWS -// On Windows, '\\' is the standard path separator, but many tools and the -// Windows API also accept '/' as an alternate path separator. Unless otherwise -// noted, a file path can contain either kind of path separators, or a mixture -// of them. -const char kPathSeparator = '\\'; -const char kAlternatePathSeparator = '/'; -const char kAlternatePathSeparatorString[] = "/"; -# if GTEST_OS_WINDOWS_MOBILE -// Windows CE doesn't have a current directory. You should not use -// the current directory in tests on Windows CE, but this at least -// provides a reasonable fallback. -const char kCurrentDirectoryString[] = "\\"; -// Windows CE doesn't define INVALID_FILE_ATTRIBUTES -const DWORD kInvalidFileAttributes = 0xffffffff; -# else -const char kCurrentDirectoryString[] = ".\\"; -# endif // GTEST_OS_WINDOWS_MOBILE -#else -const char kPathSeparator = '/'; -const char kCurrentDirectoryString[] = "./"; -#endif // GTEST_OS_WINDOWS - -// Returns whether the given character is a valid path separator. -static bool IsPathSeparator(char c) { -#if GTEST_HAS_ALT_PATH_SEP_ - return (c == kPathSeparator) || (c == kAlternatePathSeparator); -#else - return c == kPathSeparator; -#endif -} - -// Returns the current working directory, or "" if unsuccessful. -FilePath FilePath::GetCurrentDir() { -#if GTEST_OS_WINDOWS_MOBILE - // Windows CE doesn't have a current directory, so we just return - // something reasonable. - return FilePath(kCurrentDirectoryString); -#elif GTEST_OS_WINDOWS - char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; - return FilePath(_getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd); -#else - char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; - return FilePath(getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd); -#endif // GTEST_OS_WINDOWS_MOBILE -} - -// Returns a copy of the FilePath with the case-insensitive extension removed. 
-// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns -// FilePath("dir/file"). If a case-insensitive extension is not -// found, returns a copy of the original FilePath. -FilePath FilePath::RemoveExtension(const char* extension) const { - const std::string dot_extension = std::string(".") + extension; - if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) { - return FilePath(pathname_.substr( - 0, pathname_.length() - dot_extension.length())); - } - return *this; -} - -// Returns a pointer to the last occurence of a valid path separator in -// the FilePath. On Windows, for example, both '/' and '\' are valid path -// separators. Returns NULL if no path separator was found. -const char* FilePath::FindLastPathSeparator() const { - const char* const last_sep = strrchr(c_str(), kPathSeparator); -#if GTEST_HAS_ALT_PATH_SEP_ - const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator); - // Comparing two pointers of which only one is NULL is undefined. - if (last_alt_sep != NULL && - (last_sep == NULL || last_alt_sep > last_sep)) { - return last_alt_sep; - } -#endif - return last_sep; -} - -// Returns a copy of the FilePath with the directory part removed. -// Example: FilePath("path/to/file").RemoveDirectoryName() returns -// FilePath("file"). If there is no directory part ("just_a_file"), it returns -// the FilePath unmodified. If there is no file part ("just_a_dir/") it -// returns an empty FilePath (""). -// On Windows platform, '\' is the path separator, otherwise it is '/'. -FilePath FilePath::RemoveDirectoryName() const { - const char* const last_sep = FindLastPathSeparator(); - return last_sep ? FilePath(last_sep + 1) : *this; -} - -// RemoveFileName returns the directory path with the filename removed. -// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". -// If the FilePath is "a_file" or "/a_file", RemoveFileName returns -// FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does -// not have a file, like "just/a/dir/", it returns the FilePath unmodified. -// On Windows platform, '\' is the path separator, otherwise it is '/'. -FilePath FilePath::RemoveFileName() const { - const char* const last_sep = FindLastPathSeparator(); - std::string dir; - if (last_sep) { - dir = std::string(c_str(), last_sep + 1 - c_str()); - } else { - dir = kCurrentDirectoryString; - } - return FilePath(dir); -} - -// Helper functions for naming files in a directory for xml output. - -// Given directory = "dir", base_name = "test", number = 0, -// extension = "xml", returns "dir/test.xml". If number is greater -// than zero (e.g., 12), returns "dir/test_12.xml". -// On Windows platform, uses \ as the separator rather than /. -FilePath FilePath::MakeFileName(const FilePath& directory, - const FilePath& base_name, - int number, - const char* extension) { - std::string file; - if (number == 0) { - file = base_name.string() + "." + extension; - } else { - file = base_name.string() + "_" + StreamableToString(number) - + "." + extension; - } - return ConcatPaths(directory, FilePath(file)); -} - -// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml". -// On Windows, uses \ as the separator rather than /. 
-FilePath FilePath::ConcatPaths(const FilePath& directory, - const FilePath& relative_path) { - if (directory.IsEmpty()) - return relative_path; - const FilePath dir(directory.RemoveTrailingPathSeparator()); - return FilePath(dir.string() + kPathSeparator + relative_path.string()); -} - -// Returns true if pathname describes something findable in the file-system, -// either a file, directory, or whatever. -bool FilePath::FileOrDirectoryExists() const { -#if GTEST_OS_WINDOWS_MOBILE - LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str()); - const DWORD attributes = GetFileAttributes(unicode); - delete [] unicode; - return attributes != kInvalidFileAttributes; -#else - posix::StatStruct file_stat; - return posix::Stat(pathname_.c_str(), &file_stat) == 0; -#endif // GTEST_OS_WINDOWS_MOBILE -} - -// Returns true if pathname describes a directory in the file-system -// that exists. -bool FilePath::DirectoryExists() const { - bool result = false; -#if GTEST_OS_WINDOWS - // Don't strip off trailing separator if path is a root directory on - // Windows (like "C:\\"). - const FilePath& path(IsRootDirectory() ? *this : - RemoveTrailingPathSeparator()); -#else - const FilePath& path(*this); -#endif - -#if GTEST_OS_WINDOWS_MOBILE - LPCWSTR unicode = String::AnsiToUtf16(path.c_str()); - const DWORD attributes = GetFileAttributes(unicode); - delete [] unicode; - if ((attributes != kInvalidFileAttributes) && - (attributes & FILE_ATTRIBUTE_DIRECTORY)) { - result = true; - } -#else - posix::StatStruct file_stat; - result = posix::Stat(path.c_str(), &file_stat) == 0 && - posix::IsDir(file_stat); -#endif // GTEST_OS_WINDOWS_MOBILE - - return result; -} - -// Returns true if pathname describes a root directory. (Windows has one -// root directory per disk drive.) -bool FilePath::IsRootDirectory() const { -#if GTEST_OS_WINDOWS - // TODO(wan@google.com): on Windows a network share like - // \\server\share can be a root directory, although it cannot be the - // current directory. Handle this properly. - return pathname_.length() == 3 && IsAbsolutePath(); -#else - return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]); -#endif -} - -// Returns true if pathname describes an absolute path. -bool FilePath::IsAbsolutePath() const { - const char* const name = pathname_.c_str(); -#if GTEST_OS_WINDOWS - return pathname_.length() >= 3 && - ((name[0] >= 'a' && name[0] <= 'z') || - (name[0] >= 'A' && name[0] <= 'Z')) && - name[1] == ':' && - IsPathSeparator(name[2]); -#else - return IsPathSeparator(name[0]); -#endif -} - -// Returns a pathname for a file that does not currently exist. The pathname -// will be directory/base_name.extension or -// directory/base_name_.extension if directory/base_name.extension -// already exists. The number will be incremented until a pathname is found -// that does not already exist. -// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. -// There could be a race condition if two or more processes are calling this -// function at the same time -- they could both pick the same filename. -FilePath FilePath::GenerateUniqueFileName(const FilePath& directory, - const FilePath& base_name, - const char* extension) { - FilePath full_pathname; - int number = 0; - do { - full_pathname.Set(MakeFileName(directory, base_name, number++, extension)); - } while (full_pathname.FileOrDirectoryExists()); - return full_pathname; -} - -// Returns true if FilePath ends with a path separator, which indicates that -// it is intended to represent a directory. Returns false otherwise. 
-// This does NOT check that a directory (or file) actually exists. -bool FilePath::IsDirectory() const { - return !pathname_.empty() && - IsPathSeparator(pathname_.c_str()[pathname_.length() - 1]); -} - -// Create directories so that path exists. Returns true if successful or if -// the directories already exist; returns false if unable to create directories -// for any reason. -bool FilePath::CreateDirectoriesRecursively() const { - if (!this->IsDirectory()) { - return false; - } - - if (pathname_.length() == 0 || this->DirectoryExists()) { - return true; - } - - const FilePath parent(this->RemoveTrailingPathSeparator().RemoveFileName()); - return parent.CreateDirectoriesRecursively() && this->CreateFolder(); -} - -// Create the directory so that path exists. Returns true if successful or -// if the directory already exists; returns false if unable to create the -// directory for any reason, including if the parent directory does not -// exist. Not named "CreateDirectory" because that's a macro on Windows. -bool FilePath::CreateFolder() const { -#if GTEST_OS_WINDOWS_MOBILE - FilePath removed_sep(this->RemoveTrailingPathSeparator()); - LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str()); - int result = CreateDirectory(unicode, NULL) ? 0 : -1; - delete [] unicode; -#elif GTEST_OS_WINDOWS - int result = _mkdir(pathname_.c_str()); -#else - int result = mkdir(pathname_.c_str(), 0777); -#endif // GTEST_OS_WINDOWS_MOBILE - - if (result == -1) { - return this->DirectoryExists(); // An error is OK if the directory exists. - } - return true; // No error. -} - -// If input name has a trailing separator character, remove it and return the -// name, otherwise return the name string unmodified. -// On Windows platform, uses \ as the separator, other platforms use /. -FilePath FilePath::RemoveTrailingPathSeparator() const { - return IsDirectory() - ? FilePath(pathname_.substr(0, pathname_.length() - 1)) - : *this; -} - -// Removes any redundant separators that might be in the pathname. -// For example, "bar///foo" becomes "bar/foo". Does not eliminate other -// redundancies that might be in a pathname involving "." or "..". -// TODO(wan@google.com): handle Windows network shares (e.g. \\server\share). -void FilePath::Normalize() { - if (pathname_.c_str() == NULL) { - pathname_ = ""; - return; - } - const char* src = pathname_.c_str(); - char* const dest = new char[pathname_.length() + 1]; - char* dest_ptr = dest; - memset(dest_ptr, 0, pathname_.length() + 1); - - while (*src != '\0') { - *dest_ptr = *src; - if (!IsPathSeparator(*src)) { - src++; - } else { -#if GTEST_HAS_ALT_PATH_SEP_ - if (*dest_ptr == kAlternatePathSeparator) { - *dest_ptr = kPathSeparator; - } -#endif - while (IsPathSeparator(*src)) - src++; - } - dest_ptr++; - } - *dest_ptr = '\0'; - pathname_ = dest; - delete[] dest; -} - -} // namespace internal -} // namespace testing -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. 
nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - - -#include -#include -#include -#include - -#if GTEST_OS_WINDOWS_MOBILE -# include // For TerminateProcess() -#elif GTEST_OS_WINDOWS -# include -# include -#else -# include -#endif // GTEST_OS_WINDOWS_MOBILE - -#if GTEST_OS_MAC -# include -# include -# include -#endif // GTEST_OS_MAC - -#if GTEST_OS_QNX -# include -# include -#endif // GTEST_OS_QNX - - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick is to -// prevent a user from accidentally including gtest-internal-inl.h in -// his code. -#define GTEST_IMPLEMENTATION_ 1 -#undef GTEST_IMPLEMENTATION_ - -namespace testing { -namespace internal { - -#if defined(_MSC_VER) || defined(__BORLANDC__) -// MSVC and C++Builder do not provide a definition of STDERR_FILENO. -const int kStdOutFileno = 1; -const int kStdErrFileno = 2; -#else -const int kStdOutFileno = STDOUT_FILENO; -const int kStdErrFileno = STDERR_FILENO; -#endif // _MSC_VER - -#if GTEST_OS_MAC - -// Returns the number of threads running in the process, or 0 to indicate that -// we cannot detect it. -size_t GetThreadCount() { - const task_t task = mach_task_self(); - mach_msg_type_number_t thread_count; - thread_act_array_t thread_list; - const kern_return_t status = task_threads(task, &thread_list, &thread_count); - if (status == KERN_SUCCESS) { - // task_threads allocates resources in thread_list and we need to free them - // to avoid leaks. - vm_deallocate(task, - reinterpret_cast(thread_list), - sizeof(thread_t) * thread_count); - return static_cast(thread_count); - } else { - return 0; - } -} - -#elif GTEST_OS_QNX - -// Returns the number of threads running in the process, or 0 to indicate that -// we cannot detect it. -size_t GetThreadCount() { - const int fd = open("/proc/self/as", O_RDONLY); - if (fd < 0) { - return 0; - } - procfs_info process_info; - const int status = - devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), NULL); - close(fd); - if (status == EOK) { - return static_cast(process_info.num_threads); - } else { - return 0; - } -} - -#else - -size_t GetThreadCount() { - // There's no portable way to detect the number of threads, so we just - // return 0 to indicate that we cannot detect it. - return 0; -} - -#endif // GTEST_OS_MAC - -#if GTEST_USES_POSIX_RE - -// Implements RE. Currently only needed for death tests. 
- -RE::~RE() { - if (is_valid_) { - // regfree'ing an invalid regex might crash because the content - // of the regex is undefined. Since the regex's are essentially - // the same, one cannot be valid (or invalid) without the other - // being so too. - regfree(&partial_regex_); - regfree(&full_regex_); - } - free(const_cast(pattern_)); -} - -// Returns true iff regular expression re matches the entire str. -bool RE::FullMatch(const char* str, const RE& re) { - if (!re.is_valid_) return false; - - regmatch_t match; - return regexec(&re.full_regex_, str, 1, &match, 0) == 0; -} - -// Returns true iff regular expression re matches a substring of str -// (including str itself). -bool RE::PartialMatch(const char* str, const RE& re) { - if (!re.is_valid_) return false; - - regmatch_t match; - return regexec(&re.partial_regex_, str, 1, &match, 0) == 0; -} - -// Initializes an RE from its string representation. -void RE::Init(const char* regex) { - pattern_ = posix::StrDup(regex); - - // Reserves enough bytes to hold the regular expression used for a - // full match. - const size_t full_regex_len = strlen(regex) + 10; - char* const full_pattern = new char[full_regex_len]; - - snprintf(full_pattern, full_regex_len, "^(%s)$", regex); - is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0; - // We want to call regcomp(&partial_regex_, ...) even if the - // previous expression returns false. Otherwise partial_regex_ may - // not be properly initialized can may cause trouble when it's - // freed. - // - // Some implementation of POSIX regex (e.g. on at least some - // versions of Cygwin) doesn't accept the empty string as a valid - // regex. We change it to an equivalent form "()" to be safe. - if (is_valid_) { - const char* const partial_regex = (*regex == '\0') ? "()" : regex; - is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0; - } - EXPECT_TRUE(is_valid_) - << "Regular expression \"" << regex - << "\" is not a valid POSIX Extended regular expression."; - - delete[] full_pattern; -} - -#elif GTEST_USES_SIMPLE_RE - -// Returns true iff ch appears anywhere in str (excluding the -// terminating '\0' character). -bool IsInSet(char ch, const char* str) { - return ch != '\0' && strchr(str, ch) != NULL; -} - -// Returns true iff ch belongs to the given classification. Unlike -// similar functions in , these aren't affected by the -// current locale. -bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; } -bool IsAsciiPunct(char ch) { - return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~"); -} -bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); } -bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); } -bool IsAsciiWordChar(char ch) { - return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || - ('0' <= ch && ch <= '9') || ch == '_'; -} - -// Returns true iff "\\c" is a supported escape sequence. -bool IsValidEscape(char c) { - return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW")); -} - -// Returns true iff the given atom (specified by escaped and pattern) -// matches ch. The result is undefined if the atom is invalid. -bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { - if (escaped) { // "\\p" where p is pattern_char. 
- switch (pattern_char) { - case 'd': return IsAsciiDigit(ch); - case 'D': return !IsAsciiDigit(ch); - case 'f': return ch == '\f'; - case 'n': return ch == '\n'; - case 'r': return ch == '\r'; - case 's': return IsAsciiWhiteSpace(ch); - case 'S': return !IsAsciiWhiteSpace(ch); - case 't': return ch == '\t'; - case 'v': return ch == '\v'; - case 'w': return IsAsciiWordChar(ch); - case 'W': return !IsAsciiWordChar(ch); - } - return IsAsciiPunct(pattern_char) && pattern_char == ch; - } - - return (pattern_char == '.' && ch != '\n') || pattern_char == ch; -} - -// Helper function used by ValidateRegex() to format error messages. -std::string FormatRegexSyntaxError(const char* regex, int index) { - return (Message() << "Syntax error at index " << index - << " in simple regular expression \"" << regex << "\": ").GetString(); -} - -// Generates non-fatal failures and returns false if regex is invalid; -// otherwise returns true. -bool ValidateRegex(const char* regex) { - if (regex == NULL) { - // TODO(wan@google.com): fix the source file location in the - // assertion failures to match where the regex is used in user - // code. - ADD_FAILURE() << "NULL is not a valid simple regular expression."; - return false; - } - - bool is_valid = true; - - // True iff ?, *, or + can follow the previous atom. - bool prev_repeatable = false; - for (int i = 0; regex[i]; i++) { - if (regex[i] == '\\') { // An escape sequence - i++; - if (regex[i] == '\0') { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) - << "'\\' cannot appear at the end."; - return false; - } - - if (!IsValidEscape(regex[i])) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) - << "invalid escape sequence \"\\" << regex[i] << "\"."; - is_valid = false; - } - prev_repeatable = true; - } else { // Not an escape sequence. - const char ch = regex[i]; - - if (ch == '^' && i > 0) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'^' can only appear at the beginning."; - is_valid = false; - } else if (ch == '$' && regex[i + 1] != '\0') { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'$' can only appear at the end."; - is_valid = false; - } else if (IsInSet(ch, "()[]{}|")) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'" << ch << "' is unsupported."; - is_valid = false; - } else if (IsRepeat(ch) && !prev_repeatable) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'" << ch << "' can only follow a repeatable token."; - is_valid = false; - } - - prev_repeatable = !IsInSet(ch, "^$?*+"); - } - } - - return is_valid; -} - -// Matches a repeated regex atom followed by a valid simple regular -// expression. The regex atom is defined as c if escaped is false, -// or \c otherwise. repeat is the repetition meta character (?, *, -// or +). The behavior is undefined if str contains too many -// characters to be indexable by size_t, in which case the test will -// probably time out anyway. We are fine with this limitation as -// std::string has it too. -bool MatchRepetitionAndRegexAtHead( - bool escaped, char c, char repeat, const char* regex, - const char* str) { - const size_t min_count = (repeat == '+') ? 1 : 0; - const size_t max_count = (repeat == '?') ? 1 : - static_cast(-1) - 1; - // We cannot call numeric_limits::max() as it conflicts with the - // max() macro on Windows. - - for (size_t i = 0; i <= max_count; ++i) { - // We know that the atom matches each of the first i characters in str. 
-    if (i >= min_count && MatchRegexAtHead(regex, str + i)) {
-      // We have enough matches at the head, and the tail matches too.
-      // Since we only care about *whether* the pattern matches str
-      // (as opposed to *how* it matches), there is no need to find a
-      // greedy match.
-      return true;
-    }
-    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
-      return false;
-  }
-  return false;
-}
-
-// Returns true iff regex matches a prefix of str. regex must be a
-// valid simple regular expression and not start with "^", or the
-// result is undefined.
-bool MatchRegexAtHead(const char* regex, const char* str) {
-  if (*regex == '\0')  // An empty regex matches a prefix of anything.
-    return true;
-
-  // "$" only matches the end of a string. Note that regex being
-  // valid guarantees that there's nothing after "$" in it.
-  if (*regex == '$')
-    return *str == '\0';
-
-  // Is the first thing in regex an escape sequence?
-  const bool escaped = *regex == '\\';
-  if (escaped)
-    ++regex;
-  if (IsRepeat(regex[1])) {
-    // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
-    // here's an indirect recursion. It terminates as the regex gets
-    // shorter in each recursion.
-    return MatchRepetitionAndRegexAtHead(
-        escaped, regex[0], regex[1], regex + 2, str);
-  } else {
-    // regex isn't empty, isn't "$", and doesn't start with a
-    // repetition. We match the first atom of regex with the first
-    // character of str and recurse.
-    return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
-        MatchRegexAtHead(regex + 1, str + 1);
-  }
-}
-
-// Returns true iff regex matches any substring of str. regex must be
-// a valid simple regular expression, or the result is undefined.
-//
-// The algorithm is recursive, but the recursion depth doesn't exceed
-// the regex length, so we won't need to worry about running out of
-// stack space normally. In rare cases the time complexity can be
-// exponential with respect to the regex length + the string length,
-// but usually it's much faster (often close to linear).
-bool MatchRegexAnywhere(const char* regex, const char* str) {
-  if (regex == NULL || str == NULL)
-    return false;
-
-  if (*regex == '^')
-    return MatchRegexAtHead(regex + 1, str);
-
-  // A successful match can be anywhere in str.
-  do {
-    if (MatchRegexAtHead(regex, str))
-      return true;
-  } while (*str++ != '\0');
-  return false;
-}
-
-// Implements the RE class.
-
-RE::~RE() {
-  free(const_cast<char*>(pattern_));
-  free(const_cast<char*>(full_pattern_));
-}
-
-// Returns true iff regular expression re matches the entire str.
-bool RE::FullMatch(const char* str, const RE& re) {
-  return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str);
-}
-
-// Returns true iff regular expression re matches a substring of str
-// (including str itself).
-bool RE::PartialMatch(const char* str, const RE& re) {
-  return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str);
-}
-
-// Initializes an RE from its string representation.
-void RE::Init(const char* regex) {
-  pattern_ = full_pattern_ = NULL;
-  if (regex != NULL) {
-    pattern_ = posix::StrDup(regex);
-  }
-
-  is_valid_ = ValidateRegex(regex);
-  if (!is_valid_) {
-    // No need to calculate the full pattern when the regex is invalid.
-    return;
-  }
-
-  const size_t len = strlen(regex);
-  // Reserves enough bytes to hold the regular expression used for a
-  // full match: we need space to prepend a '^', append a '$', and
-  // terminate the string with '\0'.
- char* buffer = static_cast(malloc(len + 3)); - full_pattern_ = buffer; - - if (*regex != '^') - *buffer++ = '^'; // Makes sure full_pattern_ starts with '^'. - - // We don't use snprintf or strncpy, as they trigger a warning when - // compiled with VC++ 8.0. - memcpy(buffer, regex, len); - buffer += len; - - if (len == 0 || regex[len - 1] != '$') - *buffer++ = '$'; // Makes sure full_pattern_ ends with '$'. - - *buffer = '\0'; -} - -#endif // GTEST_USES_POSIX_RE - -const char kUnknownFile[] = "unknown file"; - -// Formats a source file path and a line number as they would appear -// in an error message from the compiler used to compile this code. -GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) { - const std::string file_name(file == NULL ? kUnknownFile : file); - - if (line < 0) { - return file_name + ":"; - } -#ifdef _MSC_VER - return file_name + "(" + StreamableToString(line) + "):"; -#else - return file_name + ":" + StreamableToString(line) + ":"; -#endif // _MSC_VER -} - -// Formats a file location for compiler-independent XML output. -// Although this function is not platform dependent, we put it next to -// FormatFileLocation in order to contrast the two functions. -// Note that FormatCompilerIndependentFileLocation() does NOT append colon -// to the file location it produces, unlike FormatFileLocation(). -GTEST_API_ ::std::string FormatCompilerIndependentFileLocation( - const char* file, int line) { - const std::string file_name(file == NULL ? kUnknownFile : file); - - if (line < 0) - return file_name; - else - return file_name + ":" + StreamableToString(line); -} - - -GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line) - : severity_(severity) { - const char* const marker = - severity == GTEST_INFO ? "[ INFO ]" : - severity == GTEST_WARNING ? "[WARNING]" : - severity == GTEST_ERROR ? "[ ERROR ]" : "[ FATAL ]"; - GetStream() << ::std::endl << marker << " " - << FormatFileLocation(file, line).c_str() << ": "; -} - -// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. -GTestLog::~GTestLog() { - GetStream() << ::std::endl; - if (severity_ == GTEST_FATAL) { - fflush(stderr); - posix::Abort(); - } -} -// Disable Microsoft deprecation warnings for POSIX functions called from -// this class (creat, dup, dup2, and close) -#ifdef _MSC_VER -# pragma warning(push) -# pragma warning(disable: 4996) -#endif // _MSC_VER - -#if GTEST_HAS_STREAM_REDIRECTION - -// Object that captures an output stream (stdout/stderr). -class CapturedStream { - public: - // The ctor redirects the stream to a temporary file. - explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) { -# if GTEST_OS_WINDOWS - char temp_dir_path[MAX_PATH + 1] = { '\0' }; // NOLINT - char temp_file_path[MAX_PATH + 1] = { '\0' }; // NOLINT - - ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path); - const UINT success = ::GetTempFileNameA(temp_dir_path, - "gtest_redir", - 0, // Generate unique file name. - temp_file_path); - GTEST_CHECK_(success != 0) - << "Unable to create a temporary file in " << temp_dir_path; - const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE); - GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file " - << temp_file_path; - filename_ = temp_file_path; -# else - // There's no guarantee that a test has write access to the current - // directory, so we create the temporary file in the /tmp directory - // instead. We use /tmp on most systems, and /sdcard on Android. - // That's because Android doesn't have /tmp. 
-# if GTEST_OS_LINUX_ANDROID - // Note: Android applications are expected to call the framework's - // Context.getExternalStorageDirectory() method through JNI to get - // the location of the world-writable SD Card directory. However, - // this requires a Context handle, which cannot be retrieved - // globally from native code. Doing so also precludes running the - // code as part of a regular standalone executable, which doesn't - // run in a Dalvik process (e.g. when running it through 'adb shell'). - // - // The location /sdcard is directly accessible from native code - // and is the only location (unofficially) supported by the Android - // team. It's generally a symlink to the real SD Card mount point - // which can be /mnt/sdcard, /mnt/sdcard0, /system/media/sdcard, or - // other OEM-customized locations. Never rely on these, and always - // use /sdcard. - char name_template[] = "/sdcard/gtest_captured_stream.XXXXXX"; -# else - char name_template[] = "/tmp/captured_stream.XXXXXX"; -# endif // GTEST_OS_LINUX_ANDROID - const int captured_fd = mkstemp(name_template); - filename_ = name_template; -# endif // GTEST_OS_WINDOWS - fflush(NULL); - dup2(captured_fd, fd_); - close(captured_fd); - } - - ~CapturedStream() { - remove(filename_.c_str()); - } - - std::string GetCapturedString() { - if (uncaptured_fd_ != -1) { - // Restores the original stream. - fflush(NULL); - dup2(uncaptured_fd_, fd_); - close(uncaptured_fd_); - uncaptured_fd_ = -1; - } - - FILE* const file = posix::FOpen(filename_.c_str(), "r"); - const std::string content = ReadEntireFile(file); - posix::FClose(file); - return content; - } - - private: - // Reads the entire content of a file as an std::string. - static std::string ReadEntireFile(FILE* file); - - // Returns the size (in bytes) of a file. - static size_t GetFileSize(FILE* file); - - const int fd_; // A stream to capture. - int uncaptured_fd_; - // Name of the temporary file holding the stderr output. - ::std::string filename_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream); -}; - -// Returns the size (in bytes) of a file. -size_t CapturedStream::GetFileSize(FILE* file) { - fseek(file, 0, SEEK_END); - return static_cast(ftell(file)); -} - -// Reads the entire content of a file as a string. -std::string CapturedStream::ReadEntireFile(FILE* file) { - const size_t file_size = GetFileSize(file); - char* const buffer = new char[file_size]; - - size_t bytes_last_read = 0; // # of bytes read in the last fread() - size_t bytes_read = 0; // # of bytes read so far - - fseek(file, 0, SEEK_SET); - - // Keeps reading the file until we cannot read further or the - // pre-determined file size is reached. - do { - bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file); - bytes_read += bytes_last_read; - } while (bytes_last_read > 0 && bytes_read < file_size); - - const std::string content(buffer, bytes_read); - delete[] buffer; - - return content; -} - -# ifdef _MSC_VER -# pragma warning(pop) -# endif // _MSC_VER - -static CapturedStream* g_captured_stderr = NULL; -static CapturedStream* g_captured_stdout = NULL; - -// Starts capturing an output stream (stdout/stderr). -void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) { - if (*stream != NULL) { - GTEST_LOG_(FATAL) << "Only one " << stream_name - << " capturer can exist at a time."; - } - *stream = new CapturedStream(fd); -} - -// Stops capturing the output stream and returns the captured string. 
-std::string GetCapturedStream(CapturedStream** captured_stream) { - const std::string content = (*captured_stream)->GetCapturedString(); - - delete *captured_stream; - *captured_stream = NULL; - - return content; -} - -// Starts capturing stdout. -void CaptureStdout() { - CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout); -} - -// Starts capturing stderr. -void CaptureStderr() { - CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr); -} - -// Stops capturing stdout and returns the captured string. -std::string GetCapturedStdout() { - return GetCapturedStream(&g_captured_stdout); -} - -// Stops capturing stderr and returns the captured string. -std::string GetCapturedStderr() { - return GetCapturedStream(&g_captured_stderr); -} - -#endif // GTEST_HAS_STREAM_REDIRECTION - -#if GTEST_HAS_DEATH_TEST - -// A copy of all command line arguments. Set by InitGoogleTest(). -::std::vector g_argvs; - -static const ::std::vector* g_injected_test_argvs = - NULL; // Owned. - -void SetInjectableArgvs(const ::std::vector* argvs) { - if (g_injected_test_argvs != argvs) - delete g_injected_test_argvs; - g_injected_test_argvs = argvs; -} - -const ::std::vector& GetInjectableArgvs() { - if (g_injected_test_argvs != NULL) { - return *g_injected_test_argvs; - } - return g_argvs; -} -#endif // GTEST_HAS_DEATH_TEST - -#if GTEST_OS_WINDOWS_MOBILE -namespace posix { -void Abort() { - DebugBreak(); - TerminateProcess(GetCurrentProcess(), 1); -} -} // namespace posix -#endif // GTEST_OS_WINDOWS_MOBILE - -// Returns the name of the environment variable corresponding to the -// given flag. For example, FlagToEnvVar("foo") will return -// "GTEST_FOO" in the open-source version. -static std::string FlagToEnvVar(const char* flag) { - const std::string full_flag = - (Message() << GTEST_FLAG_PREFIX_ << flag).GetString(); - - Message env_var; - for (size_t i = 0; i != full_flag.length(); i++) { - env_var << ToUpper(full_flag.c_str()[i]); - } - - return env_var.GetString(); -} - -// Parses 'str' for a 32-bit signed integer. If successful, writes -// the result to *value and returns true; otherwise leaves *value -// unchanged and returns false. -bool ParseInt32(const Message& src_text, const char* str, Int32* value) { - // Parses the environment variable as a decimal integer. - char* end = NULL; - const long long_value = strtol(str, &end, 10); // NOLINT - - // Has strtol() consumed all characters in the string? - if (*end != '\0') { - // No - an invalid character was encountered. - Message msg; - msg << "WARNING: " << src_text - << " is expected to be a 32-bit integer, but actually" - << " has value \"" << str << "\".\n"; - printf("%s", msg.GetString().c_str()); - fflush(stdout); - return false; - } - - // Is the parsed value in the range of an Int32? - const Int32 result = static_cast(long_value); - if (long_value == LONG_MAX || long_value == LONG_MIN || - // The parsed value overflows as a long. (strtol() returns - // LONG_MAX or LONG_MIN when the input overflows.) - result != long_value - // The parsed value overflows as an Int32. - ) { - Message msg; - msg << "WARNING: " << src_text - << " is expected to be a 32-bit integer, but actually" - << " has value " << str << ", which overflows.\n"; - printf("%s", msg.GetString().c_str()); - fflush(stdout); - return false; - } - - *value = result; - return true; -} - -// Reads and returns the Boolean environment variable corresponding to -// the given flag; if it's not set, returns default_value. -// -// The value is considered true iff it's not "0". 
-bool BoolFromGTestEnv(const char* flag, bool default_value) { - const std::string env_var = FlagToEnvVar(flag); - const char* const string_value = posix::GetEnv(env_var.c_str()); - return string_value == NULL ? - default_value : strcmp(string_value, "0") != 0; -} - -// Reads and returns a 32-bit integer stored in the environment -// variable corresponding to the given flag; if it isn't set or -// doesn't represent a valid 32-bit integer, returns default_value. -Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) { - const std::string env_var = FlagToEnvVar(flag); - const char* const string_value = posix::GetEnv(env_var.c_str()); - if (string_value == NULL) { - // The environment variable is not set. - return default_value; - } - - Int32 result = default_value; - if (!ParseInt32(Message() << "Environment variable " << env_var, - string_value, &result)) { - printf("The default value %s is used.\n", - (Message() << default_value).GetString().c_str()); - fflush(stdout); - return default_value; - } - - return result; -} - -// Reads and returns the string environment variable corresponding to -// the given flag; if it's not set, returns default_value. -const char* StringFromGTestEnv(const char* flag, const char* default_value) { - const std::string env_var = FlagToEnvVar(flag); - const char* const value = posix::GetEnv(env_var.c_str()); - return value == NULL ? default_value : value; -} - -} // namespace internal -} // namespace testing -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -// Google Test - The Google C++ Testing Framework -// -// This file implements a universal value printer that can print a -// value of any type T: -// -// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); -// -// It uses the << operator when possible, and prints the bytes in the -// object otherwise. 
A user can override its behavior for a class -// type Foo by defining either operator<<(::std::ostream&, const Foo&) -// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that -// defines Foo. - -#include -#include -#include // NOLINT -#include - -namespace testing { - -namespace { - -using ::std::ostream; - -// Prints a segment of bytes in the given object. -void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start, - size_t count, ostream* os) { - char text[5] = ""; - for (size_t i = 0; i != count; i++) { - const size_t j = start + i; - if (i != 0) { - // Organizes the bytes into groups of 2 for easy parsing by - // human. - if ((j % 2) == 0) - *os << ' '; - else - *os << '-'; - } - GTEST_SNPRINTF_(text, sizeof(text), "%02X", obj_bytes[j]); - *os << text; - } -} - -// Prints the bytes in the given value to the given ostream. -void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count, - ostream* os) { - // Tells the user how big the object is. - *os << count << "-byte object <"; - - const size_t kThreshold = 132; - const size_t kChunkSize = 64; - // If the object size is bigger than kThreshold, we'll have to omit - // some details by printing only the first and the last kChunkSize - // bytes. - // TODO(wan): let the user control the threshold using a flag. - if (count < kThreshold) { - PrintByteSegmentInObjectTo(obj_bytes, 0, count, os); - } else { - PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os); - *os << " ... "; - // Rounds up to 2-byte boundary. - const size_t resume_pos = (count - kChunkSize + 1)/2*2; - PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os); - } - *os << ">"; -} - -} // namespace - -namespace internal2 { - -// Delegates to PrintBytesInObjectToImpl() to print the bytes in the -// given object. The delegation simplifies the implementation, which -// uses the << operator and thus is easier done outside of the -// ::testing::internal namespace, which contains a << operator that -// sometimes conflicts with the one in STL. -void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count, - ostream* os) { - PrintBytesInObjectToImpl(obj_bytes, count, os); -} - -} // namespace internal2 - -namespace internal { - -// Depending on the value of a char (or wchar_t), we print it in one -// of three formats: -// - as is if it's a printable ASCII (e.g. 'a', '2', ' '), -// - as a hexidecimal escape sequence (e.g. '\x7F'), or -// - as a special escape sequence (e.g. '\r', '\n'). -enum CharFormat { - kAsIs, - kHexEscape, - kSpecialEscape -}; - -// Returns true if c is a printable ASCII character. We test the -// value of c directly instead of calling isprint(), which is buggy on -// Windows Mobile. -inline bool IsPrintableAscii(wchar_t c) { - return 0x20 <= c && c <= 0x7E; -} - -// Prints a wide or narrow char c as a character literal without the -// quotes, escaping it when necessary; returns how c was formatted. -// The template argument UnsignedChar is the unsigned version of Char, -// which is the type of c. 
-template -static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) { - switch (static_cast(c)) { - case L'\0': - *os << "\\0"; - break; - case L'\'': - *os << "\\'"; - break; - case L'\\': - *os << "\\\\"; - break; - case L'\a': - *os << "\\a"; - break; - case L'\b': - *os << "\\b"; - break; - case L'\f': - *os << "\\f"; - break; - case L'\n': - *os << "\\n"; - break; - case L'\r': - *os << "\\r"; - break; - case L'\t': - *os << "\\t"; - break; - case L'\v': - *os << "\\v"; - break; - default: - if (IsPrintableAscii(c)) { - *os << static_cast(c); - return kAsIs; - } else { - *os << "\\x" + String::FormatHexInt(static_cast(c)); - return kHexEscape; - } - } - return kSpecialEscape; -} - -// Prints a wchar_t c as if it's part of a string literal, escaping it when -// necessary; returns how c was formatted. -static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) { - switch (c) { - case L'\'': - *os << "'"; - return kAsIs; - case L'"': - *os << "\\\""; - return kSpecialEscape; - default: - return PrintAsCharLiteralTo(c, os); - } -} - -// Prints a char c as if it's part of a string literal, escaping it when -// necessary; returns how c was formatted. -static CharFormat PrintAsStringLiteralTo(char c, ostream* os) { - return PrintAsStringLiteralTo( - static_cast(static_cast(c)), os); -} - -// Prints a wide or narrow character c and its code. '\0' is printed -// as "'\\0'", other unprintable characters are also properly escaped -// using the standard C++ escape sequence. The template argument -// UnsignedChar is the unsigned version of Char, which is the type of c. -template -void PrintCharAndCodeTo(Char c, ostream* os) { - // First, print c as a literal in the most readable form we can find. - *os << ((sizeof(c) > 1) ? "L'" : "'"); - const CharFormat format = PrintAsCharLiteralTo(c, os); - *os << "'"; - - // To aid user debugging, we also print c's code in decimal, unless - // it's 0 (in which case c was printed as '\\0', making the code - // obvious). - if (c == 0) - return; - *os << " (" << static_cast(c); - - // For more convenience, we print c's code again in hexidecimal, - // unless c was already printed in the form '\x##' or the code is in - // [1, 9]. - if (format == kHexEscape || (1 <= c && c <= 9)) { - // Do nothing. - } else { - *os << ", 0x" << String::FormatHexInt(static_cast(c)); - } - *os << ")"; -} - -void PrintTo(unsigned char c, ::std::ostream* os) { - PrintCharAndCodeTo(c, os); -} -void PrintTo(signed char c, ::std::ostream* os) { - PrintCharAndCodeTo(c, os); -} - -// Prints a wchar_t as a symbol if it is printable or as its internal -// code otherwise and also as its code. L'\0' is printed as "L'\\0'". -void PrintTo(wchar_t wc, ostream* os) { - PrintCharAndCodeTo(wc, os); -} - -// Prints the given array of characters to the ostream. CharType must be either -// char or wchar_t. -// The array starts at begin, the length is len, it may include '\0' characters -// and may not be NUL-terminated. -template -static void PrintCharsAsStringTo( - const CharType* begin, size_t len, ostream* os) { - const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\""; - *os << kQuoteBegin; - bool is_previous_hex = false; - for (size_t index = 0; index < len; ++index) { - const CharType cur = begin[index]; - if (is_previous_hex && IsXDigit(cur)) { - // Previous character is of '\x..' form and this character can be - // interpreted as another hexadecimal digit in its number. Break string to - // disambiguate. 
- *os << "\" " << kQuoteBegin; - } - is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape; - } - *os << "\""; -} - -// Prints a (const) char/wchar_t array of 'len' elements, starting at address -// 'begin'. CharType must be either char or wchar_t. -template -static void UniversalPrintCharArray( - const CharType* begin, size_t len, ostream* os) { - // The code - // const char kFoo[] = "foo"; - // generates an array of 4, not 3, elements, with the last one being '\0'. - // - // Therefore when printing a char array, we don't print the last element if - // it's '\0', such that the output matches the string literal as it's - // written in the source code. - if (len > 0 && begin[len - 1] == '\0') { - PrintCharsAsStringTo(begin, len - 1, os); - return; - } - - // If, however, the last element in the array is not '\0', e.g. - // const char kFoo[] = { 'f', 'o', 'o' }; - // we must print the entire array. We also print a message to indicate - // that the array is not NUL-terminated. - PrintCharsAsStringTo(begin, len, os); - *os << " (no terminating NUL)"; -} - -// Prints a (const) char array of 'len' elements, starting at address 'begin'. -void UniversalPrintArray(const char* begin, size_t len, ostream* os) { - UniversalPrintCharArray(begin, len, os); -} - -// Prints a (const) wchar_t array of 'len' elements, starting at address -// 'begin'. -void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) { - UniversalPrintCharArray(begin, len, os); -} - -// Prints the given C string to the ostream. -void PrintTo(const char* s, ostream* os) { - if (s == NULL) { - *os << "NULL"; - } else { - *os << ImplicitCast_(s) << " pointing to "; - PrintCharsAsStringTo(s, strlen(s), os); - } -} - -// MSVC compiler can be configured to define whar_t as a typedef -// of unsigned short. Defining an overload for const wchar_t* in that case -// would cause pointers to unsigned shorts be printed as wide strings, -// possibly accessing more memory than intended and causing invalid -// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when -// wchar_t is implemented as a native type. -#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) -// Prints the given wide C string to the ostream. -void PrintTo(const wchar_t* s, ostream* os) { - if (s == NULL) { - *os << "NULL"; - } else { - *os << ImplicitCast_(s) << " pointing to "; - PrintCharsAsStringTo(s, wcslen(s), os); - } -} -#endif // wchar_t is native - -// Prints a ::string object. -#if GTEST_HAS_GLOBAL_STRING -void PrintStringTo(const ::string& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); -} -#endif // GTEST_HAS_GLOBAL_STRING - -void PrintStringTo(const ::std::string& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); -} - -// Prints a ::wstring object. -#if GTEST_HAS_GLOBAL_WSTRING -void PrintWideStringTo(const ::wstring& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); -} -#endif // GTEST_HAS_GLOBAL_WSTRING - -#if GTEST_HAS_STD_WSTRING -void PrintWideStringTo(const ::std::wstring& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); -} -#endif // GTEST_HAS_STD_WSTRING - -} // namespace internal - -} // namespace testing -// Copyright 2008, Google Inc. -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: mheule@google.com (Markus Heule) -// -// The Google C++ Testing Framework (Google Test) - - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick is to -// prevent a user from accidentally including gtest-internal-inl.h in -// his code. -#define GTEST_IMPLEMENTATION_ 1 -#undef GTEST_IMPLEMENTATION_ - -namespace testing { - -using internal::GetUnitTestImpl; - -// Gets the summary of the failure message by omitting the stack trace -// in it. -std::string TestPartResult::ExtractSummary(const char* message) { - const char* const stack_trace = strstr(message, internal::kStackTraceMarker); - return stack_trace == NULL ? message : - std::string(message, stack_trace); -} - -// Prints a TestPartResult object. -std::ostream& operator<<(std::ostream& os, const TestPartResult& result) { - return os - << result.file_name() << ":" << result.line_number() << ": " - << (result.type() == TestPartResult::kSuccess ? "Success" : - result.type() == TestPartResult::kFatalFailure ? "Fatal failure" : - "Non-fatal failure") << ":\n" - << result.message() << std::endl; -} - -// Appends a TestPartResult to the array. -void TestPartResultArray::Append(const TestPartResult& result) { - array_.push_back(result); -} - -// Returns the TestPartResult at the given index (0-based). -const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const { - if (index < 0 || index >= size()) { - printf("\nInvalid index (%d) into TestPartResultArray.\n", index); - internal::posix::Abort(); - } - - return array_[index]; -} - -// Returns the number of TestPartResult objects in the array. 
-int TestPartResultArray::size() const { - return static_cast(array_.size()); -} - -namespace internal { - -HasNewFatalFailureHelper::HasNewFatalFailureHelper() - : has_new_fatal_failure_(false), - original_reporter_(GetUnitTestImpl()-> - GetTestPartResultReporterForCurrentThread()) { - GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this); -} - -HasNewFatalFailureHelper::~HasNewFatalFailureHelper() { - GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread( - original_reporter_); -} - -void HasNewFatalFailureHelper::ReportTestPartResult( - const TestPartResult& result) { - if (result.fatally_failed()) - has_new_fatal_failure_ = true; - original_reporter_->ReportTestPartResult(result); -} - -} // namespace internal - -} // namespace testing -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - - -namespace testing { -namespace internal { - -#if GTEST_HAS_TYPED_TEST_P - -// Skips to the first non-space char in str. Returns an empty string if str -// contains only whitespace characters. -static const char* SkipSpaces(const char* str) { - while (IsSpace(*str)) - str++; - return str; -} - -// Verifies that registered_tests match the test names in -// defined_test_names_; returns registered_tests if successful, or -// aborts the program otherwise. -const char* TypedTestCasePState::VerifyRegisteredTestNames( - const char* file, int line, const char* registered_tests) { - typedef ::std::set::const_iterator DefinedTestIter; - registered_ = true; - - // Skip initial whitespace in registered_tests since some - // preprocessors prefix stringizied literals with whitespace. 
- registered_tests = SkipSpaces(registered_tests); - - Message errors; - ::std::set tests; - for (const char* names = registered_tests; names != NULL; - names = SkipComma(names)) { - const std::string name = GetPrefixUntilComma(names); - if (tests.count(name) != 0) { - errors << "Test " << name << " is listed more than once.\n"; - continue; - } - - bool found = false; - for (DefinedTestIter it = defined_test_names_.begin(); - it != defined_test_names_.end(); - ++it) { - if (name == *it) { - found = true; - break; - } - } - - if (found) { - tests.insert(name); - } else { - errors << "No test named " << name - << " can be found in this test case.\n"; - } - } - - for (DefinedTestIter it = defined_test_names_.begin(); - it != defined_test_names_.end(); - ++it) { - if (tests.count(*it) == 0) { - errors << "You forgot to list test " << *it << ".\n"; - } - } - - const std::string& errors_str = errors.GetString(); - if (errors_str != "") { - fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(), - errors_str.c_str()); - fflush(stderr); - posix::Abort(); - } - - return registered_tests; -} - -#endif // GTEST_HAS_TYPED_TEST_P - -} // namespace internal -} // namespace testing +#include "src/gtest.cc" +#include "src/gtest-death-test.cc" +#include "src/gtest-filepath.cc" +#include "src/gtest-port.cc" +#include "src/gtest-printers.cc" +#include "src/gtest-test-part.cc" +#include "src/gtest-typed-test.cc" diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-death-test.cc b/libs/libvpx/third_party/googletest/src/src/gtest-death-test.cc new file mode 100644 index 0000000000..a01a369830 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/src/gtest-death-test.cc @@ -0,0 +1,1342 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev) +// +// This file implements death tests. 
+
+#include "gtest/gtest-death-test.h"
+#include "gtest/internal/gtest-port.h"
+#include "gtest/internal/custom/gtest.h"
+
+#if GTEST_HAS_DEATH_TEST
+
+# if GTEST_OS_MAC
+#  include <crt_externs.h>
+# endif  // GTEST_OS_MAC
+
+# include <errno.h>
+# include <fcntl.h>
+# include <limits.h>
+
+# if GTEST_OS_LINUX
+#  include <signal.h>
+# endif  // GTEST_OS_LINUX
+
+# include <stdarg.h>
+
+# if GTEST_OS_WINDOWS
+#  include <windows.h>
+# else
+#  include <sys/mman.h>
+#  include <sys/wait.h>
+# endif  // GTEST_OS_WINDOWS
+
+# if GTEST_OS_QNX
+#  include <spawn.h>
+# endif  // GTEST_OS_QNX
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-string.h"
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick exists to
+// prevent the accidental inclusion of gtest-internal-inl.h in the
+// user's code.
+#define GTEST_IMPLEMENTATION_ 1
+#include "src/gtest-internal-inl.h"
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+
+// Constants.
+
+// The default death test style.
+static const char kDefaultDeathTestStyle[] = "fast";
+
+GTEST_DEFINE_string_(
+    death_test_style,
+    internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle),
+    "Indicates how to run a death test in a forked child process: "
+    "\"threadsafe\" (child process re-executes the test binary "
+    "from the beginning, running only the specific death test) or "
+    "\"fast\" (child process runs the death test immediately "
+    "after forking).");
+
+GTEST_DEFINE_bool_(
+    death_test_use_fork,
+    internal::BoolFromGTestEnv("death_test_use_fork", false),
+    "Instructs to use fork()/_exit() instead of clone() in death tests. "
+    "Ignored and always uses fork() on POSIX systems where clone() is not "
+    "implemented. Useful when running under valgrind or similar tools if "
+    "those do not support clone(). Valgrind 3.3.1 will just fail if "
+    "it sees an unsupported combination of clone() flags. "
+    "It is not recommended to use this flag w/o valgrind though it will "
+    "work in 99% of the cases. Once valgrind is fixed, this flag will "
+    "most likely be removed.");
+
+namespace internal {
+GTEST_DEFINE_string_(
+    internal_run_death_test, "",
+    "Indicates the file, line number, temporal index of "
+    "the single death test to run, and a file descriptor to "
+    "which a success code may be sent, all separated by "
+    "the '|' characters.  This flag is specified if and only if the current "
+    "process is a sub-process launched for running a thread-safe "
+    "death test.  FOR INTERNAL USE ONLY.");
+}  // namespace internal
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Valid only for fast death tests. Indicates the code is running in the
+// child process of a fast style death test.
+# if !GTEST_OS_WINDOWS
+static bool g_in_fast_death_test_child = false;
+# endif
+
+// Returns a Boolean value indicating whether the caller is currently
+// executing in the context of the death test child process.  Tools such as
+// Valgrind heap checkers may need this to modify their behavior in death
+// tests.  IMPORTANT: This is an internal utility.  Using it may break the
+// implementation of death tests.  User code MUST NOT use it.
+bool InDeathTestChild() {
+# if GTEST_OS_WINDOWS
+
+  // On Windows, death tests are thread-safe regardless of the value of the
+  // death_test_style flag.
+ return !GTEST_FLAG(internal_run_death_test).empty(); + +# else + + if (GTEST_FLAG(death_test_style) == "threadsafe") + return !GTEST_FLAG(internal_run_death_test).empty(); + else + return g_in_fast_death_test_child; +#endif +} + +} // namespace internal + +// ExitedWithCode constructor. +ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) { +} + +// ExitedWithCode function-call operator. +bool ExitedWithCode::operator()(int exit_status) const { +# if GTEST_OS_WINDOWS + + return exit_status == exit_code_; + +# else + + return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_; + +# endif // GTEST_OS_WINDOWS +} + +# if !GTEST_OS_WINDOWS +// KilledBySignal constructor. +KilledBySignal::KilledBySignal(int signum) : signum_(signum) { +} + +// KilledBySignal function-call operator. +bool KilledBySignal::operator()(int exit_status) const { +# if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) + { + bool result; + if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) { + return result; + } + } +# endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) + return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_; +} +# endif // !GTEST_OS_WINDOWS + +namespace internal { + +// Utilities needed for death tests. + +// Generates a textual description of a given exit code, in the format +// specified by wait(2). +static std::string ExitSummary(int exit_code) { + Message m; + +# if GTEST_OS_WINDOWS + + m << "Exited with exit status " << exit_code; + +# else + + if (WIFEXITED(exit_code)) { + m << "Exited with exit status " << WEXITSTATUS(exit_code); + } else if (WIFSIGNALED(exit_code)) { + m << "Terminated by signal " << WTERMSIG(exit_code); + } +# ifdef WCOREDUMP + if (WCOREDUMP(exit_code)) { + m << " (core dumped)"; + } +# endif +# endif // GTEST_OS_WINDOWS + + return m.GetString(); +} + +// Returns true if exit_status describes a process that was terminated +// by a signal, or exited normally with a nonzero exit code. +bool ExitedUnsuccessfully(int exit_status) { + return !ExitedWithCode(0)(exit_status); +} + +# if !GTEST_OS_WINDOWS +// Generates a textual failure message when a death test finds more than +// one thread running, or cannot determine the number of threads, prior +// to executing the given statement. It is the responsibility of the +// caller not to pass a thread_count of 1. +static std::string DeathTestThreadWarning(size_t thread_count) { + Message msg; + msg << "Death tests use fork(), which is unsafe particularly" + << " in a threaded context. For this test, " << GTEST_NAME_ << " "; + if (thread_count == 0) + msg << "couldn't detect the number of threads."; + else + msg << "detected " << thread_count << " threads."; + return msg.GetString(); +} +# endif // !GTEST_OS_WINDOWS + +// Flag characters for reporting a death test that did not die. +static const char kDeathTestLived = 'L'; +static const char kDeathTestReturned = 'R'; +static const char kDeathTestThrew = 'T'; +static const char kDeathTestInternalError = 'I'; + +// An enumeration describing all of the possible ways that a death test can +// conclude. DIED means that the process died while executing the test +// code; LIVED means that process lived beyond the end of the test code; +// RETURNED means that the test statement attempted to execute a return +// statement, which is not allowed; THREW means that the test statement +// returned control by throwing an exception. IN_PROGRESS means the test +// has not yet concluded. 
+// TODO(vladl@google.com): Unify names and possibly values for +// AbortReason, DeathTestOutcome, and flag characters above. +enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW }; + +// Routine for aborting the program which is safe to call from an +// exec-style death test child process, in which case the error +// message is propagated back to the parent process. Otherwise, the +// message is simply printed to stderr. In either case, the program +// then exits with status 1. +void DeathTestAbort(const std::string& message) { + // On a POSIX system, this function may be called from a threadsafe-style + // death test child process, which operates on a very small stack. Use + // the heap for any additional non-minuscule memory requirements. + const InternalRunDeathTestFlag* const flag = + GetUnitTestImpl()->internal_run_death_test_flag(); + if (flag != NULL) { + FILE* parent = posix::FDOpen(flag->write_fd(), "w"); + fputc(kDeathTestInternalError, parent); + fprintf(parent, "%s", message.c_str()); + fflush(parent); + _exit(1); + } else { + fprintf(stderr, "%s", message.c_str()); + fflush(stderr); + posix::Abort(); + } +} + +// A replacement for CHECK that calls DeathTestAbort if the assertion +// fails. +# define GTEST_DEATH_TEST_CHECK_(expression) \ + do { \ + if (!::testing::internal::IsTrue(expression)) { \ + DeathTestAbort( \ + ::std::string("CHECK failed: File ") + __FILE__ + ", line " \ + + ::testing::internal::StreamableToString(__LINE__) + ": " \ + + #expression); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for +// evaluating any system call that fulfills two conditions: it must return +// -1 on failure, and set errno to EINTR when it is interrupted and +// should be tried again. The macro expands to a loop that repeatedly +// evaluates the expression as long as it evaluates to -1 and sets +// errno to EINTR. If the expression evaluates to -1 but errno is +// something other than EINTR, DeathTestAbort is called. +# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \ + do { \ + int gtest_retval; \ + do { \ + gtest_retval = (expression); \ + } while (gtest_retval == -1 && errno == EINTR); \ + if (gtest_retval == -1) { \ + DeathTestAbort( \ + ::std::string("CHECK failed: File ") + __FILE__ + ", line " \ + + ::testing::internal::StreamableToString(__LINE__) + ": " \ + + #expression + " != -1"); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +// Returns the message describing the last system error in errno. +std::string GetLastErrnoDescription() { + return errno == 0 ? "" : posix::StrError(errno); +} + +// This is called from a death test parent process to read a failure +// message from the death test child process and log it with the FATAL +// severity. On Windows, the message is read from a pipe handle. On other +// platforms, it is read from a file descriptor. +static void FailFromInternalError(int fd) { + Message error; + char buffer[256]; + int num_read; + + do { + while ((num_read = posix::Read(fd, buffer, 255)) > 0) { + buffer[num_read] = '\0'; + error << buffer; + } + } while (num_read == -1 && errno == EINTR); + + if (num_read == 0) { + GTEST_LOG_(FATAL) << error.GetString(); + } else { + const int last_error = errno; + GTEST_LOG_(FATAL) << "Error while reading death test internal: " + << GetLastErrnoDescription() << " [" << last_error << "]"; + } +} + +// Death test constructor. Increments the running death test count +// for the current test. 
+DeathTest::DeathTest() { + TestInfo* const info = GetUnitTestImpl()->current_test_info(); + if (info == NULL) { + DeathTestAbort("Cannot run a death test outside of a TEST or " + "TEST_F construct"); + } +} + +// Creates and returns a death test by dispatching to the current +// death test factory. +bool DeathTest::Create(const char* statement, const RE* regex, + const char* file, int line, DeathTest** test) { + return GetUnitTestImpl()->death_test_factory()->Create( + statement, regex, file, line, test); +} + +const char* DeathTest::LastMessage() { + return last_death_test_message_.c_str(); +} + +void DeathTest::set_last_death_test_message(const std::string& message) { + last_death_test_message_ = message; +} + +std::string DeathTest::last_death_test_message_; + +// Provides cross platform implementation for some death functionality. +class DeathTestImpl : public DeathTest { + protected: + DeathTestImpl(const char* a_statement, const RE* a_regex) + : statement_(a_statement), + regex_(a_regex), + spawned_(false), + status_(-1), + outcome_(IN_PROGRESS), + read_fd_(-1), + write_fd_(-1) {} + + // read_fd_ is expected to be closed and cleared by a derived class. + ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); } + + void Abort(AbortReason reason); + virtual bool Passed(bool status_ok); + + const char* statement() const { return statement_; } + const RE* regex() const { return regex_; } + bool spawned() const { return spawned_; } + void set_spawned(bool is_spawned) { spawned_ = is_spawned; } + int status() const { return status_; } + void set_status(int a_status) { status_ = a_status; } + DeathTestOutcome outcome() const { return outcome_; } + void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; } + int read_fd() const { return read_fd_; } + void set_read_fd(int fd) { read_fd_ = fd; } + int write_fd() const { return write_fd_; } + void set_write_fd(int fd) { write_fd_ = fd; } + + // Called in the parent process only. Reads the result code of the death + // test child process via a pipe, interprets it to set the outcome_ + // member, and closes read_fd_. Outputs diagnostics and terminates in + // case of unexpected codes. + void ReadAndInterpretStatusByte(); + + private: + // The textual content of the code this object is testing. This class + // doesn't own this string and should not attempt to delete it. + const char* const statement_; + // The regular expression which test output must match. DeathTestImpl + // doesn't own this object and should not attempt to delete it. + const RE* const regex_; + // True if the death test child process has been successfully spawned. + bool spawned_; + // The exit status of the child process. + int status_; + // How the death test concluded. + DeathTestOutcome outcome_; + // Descriptor to the read end of the pipe to the child process. It is + // always -1 in the child process. The child keeps its write end of the + // pipe in write_fd_. + int read_fd_; + // Descriptor to the child's write end of the pipe to the parent process. + // It is always -1 in the parent process. The parent keeps its end of the + // pipe in read_fd_. + int write_fd_; +}; + +// Called in the parent process only. Reads the result code of the death +// test child process via a pipe, interprets it to set the outcome_ +// member, and closes read_fd_. Outputs diagnostics and terminates in +// case of unexpected codes. 
+void DeathTestImpl::ReadAndInterpretStatusByte() { + char flag; + int bytes_read; + + // The read() here blocks until data is available (signifying the + // failure of the death test) or until the pipe is closed (signifying + // its success), so it's okay to call this in the parent before + // the child process has exited. + do { + bytes_read = posix::Read(read_fd(), &flag, 1); + } while (bytes_read == -1 && errno == EINTR); + + if (bytes_read == 0) { + set_outcome(DIED); + } else if (bytes_read == 1) { + switch (flag) { + case kDeathTestReturned: + set_outcome(RETURNED); + break; + case kDeathTestThrew: + set_outcome(THREW); + break; + case kDeathTestLived: + set_outcome(LIVED); + break; + case kDeathTestInternalError: + FailFromInternalError(read_fd()); // Does not return. + break; + default: + GTEST_LOG_(FATAL) << "Death test child process reported " + << "unexpected status byte (" + << static_cast(flag) << ")"; + } + } else { + GTEST_LOG_(FATAL) << "Read from death test child process failed: " + << GetLastErrnoDescription(); + } + GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd())); + set_read_fd(-1); +} + +// Signals that the death test code which should have exited, didn't. +// Should be called only in a death test child process. +// Writes a status byte to the child's status file descriptor, then +// calls _exit(1). +void DeathTestImpl::Abort(AbortReason reason) { + // The parent process considers the death test to be a failure if + // it finds any data in our pipe. So, here we write a single flag byte + // to the pipe, then exit. + const char status_ch = + reason == TEST_DID_NOT_DIE ? kDeathTestLived : + reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned; + + GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1)); + // We are leaking the descriptor here because on some platforms (i.e., + // when built as Windows DLL), destructors of global objects will still + // run after calling _exit(). On such systems, write_fd_ will be + // indirectly closed from the destructor of UnitTestImpl, causing double + // close if it is also closed here. On debug configurations, double close + // may assert. As there are no in-process buffers to flush here, we are + // relying on the OS to close the descriptor after the process terminates + // when the destructors are not run. + _exit(1); // Exits w/o any normal exit hooks (we were supposed to crash) +} + +// Returns an indented copy of stderr output for a death test. +// This makes distinguishing death test output lines from regular log lines +// much easier. +static ::std::string FormatDeathTestOutput(const ::std::string& output) { + ::std::string ret; + for (size_t at = 0; ; ) { + const size_t line_end = output.find('\n', at); + ret += "[ DEATH ] "; + if (line_end == ::std::string::npos) { + ret += output.substr(at); + break; + } + ret += output.substr(at, line_end + 1 - at); + at = line_end + 1; + } + return ret; +} + +// Assesses the success or failure of a death test, using both private +// members which have previously been set, and one argument: +// +// Private data members: +// outcome: An enumeration describing how the death test +// concluded: DIED, LIVED, THREW, or RETURNED. The death test +// fails in the latter three cases. +// status: The exit status of the child process. On *nix, it is in the +// in the format specified by wait(2). On Windows, this is the +// value supplied to the ExitProcess() API or a numeric code +// of the exception that terminated the program. 
+// regex: A regular expression object to be applied to +// the test's captured standard error output; the death test +// fails if it does not match. +// +// Argument: +// status_ok: true if exit_status is acceptable in the context of +// this particular death test, which fails if it is false +// +// Returns true iff all of the above conditions are met. Otherwise, the +// first failing condition, in the order given above, is the one that is +// reported. Also sets the last death test message string. +bool DeathTestImpl::Passed(bool status_ok) { + if (!spawned()) + return false; + + const std::string error_message = GetCapturedStderr(); + + bool success = false; + Message buffer; + + buffer << "Death test: " << statement() << "\n"; + switch (outcome()) { + case LIVED: + buffer << " Result: failed to die.\n" + << " Error msg:\n" << FormatDeathTestOutput(error_message); + break; + case THREW: + buffer << " Result: threw an exception.\n" + << " Error msg:\n" << FormatDeathTestOutput(error_message); + break; + case RETURNED: + buffer << " Result: illegal return in test statement.\n" + << " Error msg:\n" << FormatDeathTestOutput(error_message); + break; + case DIED: + if (status_ok) { + const bool matched = RE::PartialMatch(error_message.c_str(), *regex()); + if (matched) { + success = true; + } else { + buffer << " Result: died but not with expected error.\n" + << " Expected: " << regex()->pattern() << "\n" + << "Actual msg:\n" << FormatDeathTestOutput(error_message); + } + } else { + buffer << " Result: died but not with expected exit code:\n" + << " " << ExitSummary(status()) << "\n" + << "Actual msg:\n" << FormatDeathTestOutput(error_message); + } + break; + case IN_PROGRESS: + default: + GTEST_LOG_(FATAL) + << "DeathTest::Passed somehow called before conclusion of test"; + } + + DeathTest::set_last_death_test_message(buffer.GetString()); + return success; +} + +# if GTEST_OS_WINDOWS +// WindowsDeathTest implements death tests on Windows. Due to the +// specifics of starting new processes on Windows, death tests there are +// always threadsafe, and Google Test considers the +// --gtest_death_test_style=fast setting to be equivalent to +// --gtest_death_test_style=threadsafe there. +// +// A few implementation notes: Like the Linux version, the Windows +// implementation uses pipes for child-to-parent communication. But due to +// the specifics of pipes on Windows, some extra steps are required: +// +// 1. The parent creates a communication pipe and stores handles to both +// ends of it. +// 2. The parent starts the child and provides it with the information +// necessary to acquire the handle to the write end of the pipe. +// 3. The child acquires the write end of the pipe and signals the parent +// using a Windows event. +// 4. Now the parent can release the write end of the pipe on its side. If +// this is done before step 3, the object's reference count goes down to +// 0 and it is destroyed, preventing the child from acquiring it. The +// parent now has to release it, or read operations on the read end of +// the pipe will not return when the child terminates. +// 5. The parent reads child's output through the pipe (outcome code and +// any possible error messages) from the pipe, and its stderr and then +// determines whether to fail the test. +// +// Note: to distinguish Win32 API calls from the local method and function +// calls, the former are explicitly resolved in the global namespace. 
+// +class WindowsDeathTest : public DeathTestImpl { + public: + WindowsDeathTest(const char* a_statement, + const RE* a_regex, + const char* file, + int line) + : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {} + + // All of these virtual functions are inherited from DeathTest. + virtual int Wait(); + virtual TestRole AssumeRole(); + + private: + // The name of the file in which the death test is located. + const char* const file_; + // The line number on which the death test is located. + const int line_; + // Handle to the write end of the pipe to the child process. + AutoHandle write_handle_; + // Child process handle. + AutoHandle child_handle_; + // Event the child process uses to signal the parent that it has + // acquired the handle to the write end of the pipe. After seeing this + // event the parent can release its own handles to make sure its + // ReadFile() calls return when the child terminates. + AutoHandle event_handle_; +}; + +// Waits for the child in a death test to exit, returning its exit +// status, or 0 if no child process exists. As a side effect, sets the +// outcome data member. +int WindowsDeathTest::Wait() { + if (!spawned()) + return 0; + + // Wait until the child either signals that it has acquired the write end + // of the pipe or it dies. + const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() }; + switch (::WaitForMultipleObjects(2, + wait_handles, + FALSE, // Waits for any of the handles. + INFINITE)) { + case WAIT_OBJECT_0: + case WAIT_OBJECT_0 + 1: + break; + default: + GTEST_DEATH_TEST_CHECK_(false); // Should not get here. + } + + // The child has acquired the write end of the pipe or exited. + // We release the handle on our side and continue. + write_handle_.Reset(); + event_handle_.Reset(); + + ReadAndInterpretStatusByte(); + + // Waits for the child process to exit if it haven't already. This + // returns immediately if the child has already exited, regardless of + // whether previous calls to WaitForMultipleObjects synchronized on this + // handle or not. + GTEST_DEATH_TEST_CHECK_( + WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(), + INFINITE)); + DWORD status_code; + GTEST_DEATH_TEST_CHECK_( + ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE); + child_handle_.Reset(); + set_status(static_cast(status_code)); + return status(); +} + +// The AssumeRole process for a Windows death test. It creates a child +// process with the same executable as the current process to run the +// death test. The child process is given the --gtest_filter and +// --gtest_internal_run_death_test flags such that it knows to run the +// current death test only. +DeathTest::TestRole WindowsDeathTest::AssumeRole() { + const UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const TestInfo* const info = impl->current_test_info(); + const int death_test_index = info->result()->death_test_count(); + + if (flag != NULL) { + // ParseInternalRunDeathTestFlag() has performed all the necessary + // processing. + set_write_fd(flag->write_fd()); + return EXECUTE_TEST; + } + + // WindowsDeathTest uses an anonymous pipe to communicate results of + // a death test. + SECURITY_ATTRIBUTES handles_are_inheritable = { + sizeof(SECURITY_ATTRIBUTES), NULL, TRUE }; + HANDLE read_handle, write_handle; + GTEST_DEATH_TEST_CHECK_( + ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable, + 0) // Default buffer size. 
+ != FALSE); + set_read_fd(::_open_osfhandle(reinterpret_cast(read_handle), + O_RDONLY)); + write_handle_.Reset(write_handle); + event_handle_.Reset(::CreateEvent( + &handles_are_inheritable, + TRUE, // The event will automatically reset to non-signaled state. + FALSE, // The initial state is non-signalled. + NULL)); // The even is unnamed. + GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL); + const std::string filter_flag = + std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" + + info->test_case_name() + "." + info->name(); + const std::string internal_flag = + std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + + "=" + file_ + "|" + StreamableToString(line_) + "|" + + StreamableToString(death_test_index) + "|" + + StreamableToString(static_cast(::GetCurrentProcessId())) + + // size_t has the same width as pointers on both 32-bit and 64-bit + // Windows platforms. + // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx. + "|" + StreamableToString(reinterpret_cast(write_handle)) + + "|" + StreamableToString(reinterpret_cast(event_handle_.Get())); + + char executable_path[_MAX_PATH + 1]; // NOLINT + GTEST_DEATH_TEST_CHECK_( + _MAX_PATH + 1 != ::GetModuleFileNameA(NULL, + executable_path, + _MAX_PATH)); + + std::string command_line = + std::string(::GetCommandLineA()) + " " + filter_flag + " \"" + + internal_flag + "\""; + + DeathTest::set_last_death_test_message(""); + + CaptureStderr(); + // Flush the log buffers since the log streams are shared with the child. + FlushInfoLog(); + + // The child process will share the standard handles with the parent. + STARTUPINFOA startup_info; + memset(&startup_info, 0, sizeof(STARTUPINFO)); + startup_info.dwFlags = STARTF_USESTDHANDLES; + startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE); + startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE); + startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE); + + PROCESS_INFORMATION process_info; + GTEST_DEATH_TEST_CHECK_(::CreateProcessA( + executable_path, + const_cast(command_line.c_str()), + NULL, // Retuned process handle is not inheritable. + NULL, // Retuned thread handle is not inheritable. + TRUE, // Child inherits all inheritable handles (for write_handle_). + 0x0, // Default creation flags. + NULL, // Inherit the parent's environment. + UnitTest::GetInstance()->original_working_dir(), + &startup_info, + &process_info) != FALSE); + child_handle_.Reset(process_info.hProcess); + ::CloseHandle(process_info.hThread); + set_spawned(true); + return OVERSEE_TEST; +} +# else // We are not on Windows. + +// ForkingDeathTest provides implementations for most of the abstract +// methods of the DeathTest interface. Only the AssumeRole method is +// left undefined. +class ForkingDeathTest : public DeathTestImpl { + public: + ForkingDeathTest(const char* statement, const RE* regex); + + // All of these virtual functions are inherited from DeathTest. + virtual int Wait(); + + protected: + void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; } + + private: + // PID of child process during death test; 0 in the child process itself. + pid_t child_pid_; +}; + +// Constructs a ForkingDeathTest. +ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex) + : DeathTestImpl(a_statement, a_regex), + child_pid_(-1) {} + +// Waits for the child in a death test to exit, returning its exit +// status, or 0 if no child process exists. As a side effect, sets the +// outcome data member. 
+int ForkingDeathTest::Wait() { + if (!spawned()) + return 0; + + ReadAndInterpretStatusByte(); + + int status_value; + GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0)); + set_status(status_value); + return status_value; +} + +// A concrete death test class that forks, then immediately runs the test +// in the child process. +class NoExecDeathTest : public ForkingDeathTest { + public: + NoExecDeathTest(const char* a_statement, const RE* a_regex) : + ForkingDeathTest(a_statement, a_regex) { } + virtual TestRole AssumeRole(); +}; + +// The AssumeRole process for a fork-and-run death test. It implements a +// straightforward fork, with a simple pipe to transmit the status byte. +DeathTest::TestRole NoExecDeathTest::AssumeRole() { + const size_t thread_count = GetThreadCount(); + if (thread_count != 1) { + GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count); + } + + int pipe_fd[2]; + GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); + + DeathTest::set_last_death_test_message(""); + CaptureStderr(); + // When we fork the process below, the log file buffers are copied, but the + // file descriptors are shared. We flush all log files here so that closing + // the file descriptors in the child process doesn't throw off the + // synchronization between descriptors and buffers in the parent process. + // This is as close to the fork as possible to avoid a race condition in case + // there are multiple threads running before the death test, and another + // thread writes to the log file. + FlushInfoLog(); + + const pid_t child_pid = fork(); + GTEST_DEATH_TEST_CHECK_(child_pid != -1); + set_child_pid(child_pid); + if (child_pid == 0) { + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0])); + set_write_fd(pipe_fd[1]); + // Redirects all logging to stderr in the child process to prevent + // concurrent writes to the log files. We capture stderr in the parent + // process and append the child process' output to a log. + LogToStderr(); + // Event forwarding to the listeners of event listener API mush be shut + // down in death test subprocesses. + GetUnitTestImpl()->listeners()->SuppressEventForwarding(); + g_in_fast_death_test_child = true; + return EXECUTE_TEST; + } else { + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); + set_read_fd(pipe_fd[0]); + set_spawned(true); + return OVERSEE_TEST; + } +} + +// A concrete death test class that forks and re-executes the main +// program from the beginning, with command-line flags set that cause +// only this specific death test to be run. +class ExecDeathTest : public ForkingDeathTest { + public: + ExecDeathTest(const char* a_statement, const RE* a_regex, + const char* file, int line) : + ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { } + virtual TestRole AssumeRole(); + private: + static ::std::vector + GetArgvsForDeathTestChildProcess() { + ::std::vector args = GetInjectableArgvs(); +# if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) + ::std::vector extra_args = + GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_(); + args.insert(args.end(), extra_args.begin(), extra_args.end()); +# endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) + return args; + } + // The name of the file in which the death test is located. + const char* const file_; + // The line number on which the death test is located. + const int line_; +}; + +// Utility class for accumulating command-line arguments. 
+class Arguments { + public: + Arguments() { + args_.push_back(NULL); + } + + ~Arguments() { + for (std::vector::iterator i = args_.begin(); i != args_.end(); + ++i) { + free(*i); + } + } + void AddArgument(const char* argument) { + args_.insert(args_.end() - 1, posix::StrDup(argument)); + } + + template + void AddArguments(const ::std::vector& arguments) { + for (typename ::std::vector::const_iterator i = arguments.begin(); + i != arguments.end(); + ++i) { + args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); + } + } + char* const* Argv() { + return &args_[0]; + } + + private: + std::vector args_; +}; + +// A struct that encompasses the arguments to the child process of a +// threadsafe-style death test process. +struct ExecDeathTestArgs { + char* const* argv; // Command-line arguments for the child's call to exec + int close_fd; // File descriptor to close; the read end of a pipe +}; + +# if GTEST_OS_MAC +inline char** GetEnviron() { + // When Google Test is built as a framework on MacOS X, the environ variable + // is unavailable. Apple's documentation (man environ) recommends using + // _NSGetEnviron() instead. + return *_NSGetEnviron(); +} +# else +// Some POSIX platforms expect you to declare environ. extern "C" makes +// it reside in the global namespace. +extern "C" char** environ; +inline char** GetEnviron() { return environ; } +# endif // GTEST_OS_MAC + +# if !GTEST_OS_QNX +// The main function for a threadsafe-style death test child process. +// This function is called in a clone()-ed process and thus must avoid +// any potentially unsafe operations like malloc or libc functions. +static int ExecDeathTestChildMain(void* child_arg) { + ExecDeathTestArgs* const args = static_cast(child_arg); + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd)); + + // We need to execute the test program in the same environment where + // it was originally invoked. Therefore we change to the original + // working directory first. + const char* const original_dir = + UnitTest::GetInstance()->original_working_dir(); + // We can safely call chdir() as it's a direct system call. + if (chdir(original_dir) != 0) { + DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " + + GetLastErrnoDescription()); + return EXIT_FAILURE; + } + + // We can safely call execve() as it's a direct system call. We + // cannot use execvp() as it's a libc function and thus potentially + // unsafe. Since execve() doesn't search the PATH, the user must + // invoke the test program via a valid path that contains at least + // one path separator. + execve(args->argv[0], args->argv, GetEnviron()); + DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " + + original_dir + " failed: " + + GetLastErrnoDescription()); + return EXIT_FAILURE; +} +# endif // !GTEST_OS_QNX + +// Two utility routines that together determine the direction the stack +// grows. +// This could be accomplished more elegantly by a single recursive +// function, but we want to guard against the unlikely possibility of +// a smart compiler optimizing the recursion away. +// +// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining +// StackLowerThanAddress into StackGrowsDown, which then doesn't give +// correct answer. +void StackLowerThanAddress(const void* ptr, bool* result) GTEST_NO_INLINE_; +void StackLowerThanAddress(const void* ptr, bool* result) { + int dummy; + *result = (&dummy < ptr); +} + +// Make sure AddressSanitizer does not tamper with the stack here. 
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +bool StackGrowsDown() { + int dummy; + bool result; + StackLowerThanAddress(&dummy, &result); + return result; +} + +// Spawns a child process with the same executable as the current process in +// a thread-safe manner and instructs it to run the death test. The +// implementation uses fork(2) + exec. On systems where clone(2) is +// available, it is used instead, being slightly more thread-safe. On QNX, +// fork supports only single-threaded environments, so this function uses +// spawn(2) there instead. The function dies with an error message if +// anything goes wrong. +static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { + ExecDeathTestArgs args = { argv, close_fd }; + pid_t child_pid = -1; + +# if GTEST_OS_QNX + // Obtains the current directory and sets it to be closed in the child + // process. + const int cwd_fd = open(".", O_RDONLY); + GTEST_DEATH_TEST_CHECK_(cwd_fd != -1); + GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC)); + // We need to execute the test program in the same environment where + // it was originally invoked. Therefore we change to the original + // working directory first. + const char* const original_dir = + UnitTest::GetInstance()->original_working_dir(); + // We can safely call chdir() as it's a direct system call. + if (chdir(original_dir) != 0) { + DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " + + GetLastErrnoDescription()); + return EXIT_FAILURE; + } + + int fd_flags; + // Set close_fd to be closed after spawn. + GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD)); + GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD, + fd_flags | FD_CLOEXEC)); + struct inheritance inherit = {0}; + // spawn is a system call. + child_pid = spawn(args.argv[0], 0, NULL, &inherit, args.argv, GetEnviron()); + // Restores the current working directory. + GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1); + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd)); + +# else // GTEST_OS_QNX +# if GTEST_OS_LINUX + // When a SIGPROF signal is received while fork() or clone() are executing, + // the process may hang. To avoid this, we ignore SIGPROF here and re-enable + // it after the call to fork()/clone() is complete. + struct sigaction saved_sigprof_action; + struct sigaction ignore_sigprof_action; + memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action)); + sigemptyset(&ignore_sigprof_action.sa_mask); + ignore_sigprof_action.sa_handler = SIG_IGN; + GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction( + SIGPROF, &ignore_sigprof_action, &saved_sigprof_action)); +# endif // GTEST_OS_LINUX + +# if GTEST_HAS_CLONE + const bool use_fork = GTEST_FLAG(death_test_use_fork); + + if (!use_fork) { + static const bool stack_grows_down = StackGrowsDown(); + const size_t stack_size = getpagesize(); + // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead. + void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED); + + // Maximum stack alignment in bytes: For a downward-growing stack, this + // amount is subtracted from size of the stack space to get an address + // that is within the stack space and is aligned on all systems we care + // about. As far as I know there is no ABI with stack alignment greater + // than 64. We assume stack and stack_size already have alignment of + // kMaxStackAlignment. 
+ const size_t kMaxStackAlignment = 64; + void* const stack_top = + static_cast(stack) + + (stack_grows_down ? stack_size - kMaxStackAlignment : 0); + GTEST_DEATH_TEST_CHECK_(stack_size > kMaxStackAlignment && + reinterpret_cast(stack_top) % kMaxStackAlignment == 0); + + child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args); + + GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1); + } +# else + const bool use_fork = true; +# endif // GTEST_HAS_CLONE + + if (use_fork && (child_pid = fork()) == 0) { + ExecDeathTestChildMain(&args); + _exit(0); + } +# endif // GTEST_OS_QNX +# if GTEST_OS_LINUX + GTEST_DEATH_TEST_CHECK_SYSCALL_( + sigaction(SIGPROF, &saved_sigprof_action, NULL)); +# endif // GTEST_OS_LINUX + + GTEST_DEATH_TEST_CHECK_(child_pid != -1); + return child_pid; +} + +// The AssumeRole process for a fork-and-exec death test. It re-executes the +// main program from the beginning, setting the --gtest_filter +// and --gtest_internal_run_death_test flags to cause only the current +// death test to be re-run. +DeathTest::TestRole ExecDeathTest::AssumeRole() { + const UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const TestInfo* const info = impl->current_test_info(); + const int death_test_index = info->result()->death_test_count(); + + if (flag != NULL) { + set_write_fd(flag->write_fd()); + return EXECUTE_TEST; + } + + int pipe_fd[2]; + GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); + // Clear the close-on-exec flag on the write end of the pipe, lest + // it be closed when the child process does an exec: + GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1); + + const std::string filter_flag = + std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" + + info->test_case_name() + "." + info->name(); + const std::string internal_flag = + std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" + + file_ + "|" + StreamableToString(line_) + "|" + + StreamableToString(death_test_index) + "|" + + StreamableToString(pipe_fd[1]); + Arguments args; + args.AddArguments(GetArgvsForDeathTestChildProcess()); + args.AddArgument(filter_flag.c_str()); + args.AddArgument(internal_flag.c_str()); + + DeathTest::set_last_death_test_message(""); + + CaptureStderr(); + // See the comment in NoExecDeathTest::AssumeRole for why the next line + // is necessary. + FlushInfoLog(); + + const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]); + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); + set_child_pid(child_pid); + set_read_fd(pipe_fd[0]); + set_spawned(true); + return OVERSEE_TEST; +} + +# endif // !GTEST_OS_WINDOWS + +// Creates a concrete DeathTest-derived class that depends on the +// --gtest_death_test_style flag, and sets the pointer pointed to +// by the "test" argument to its address. If the test should be +// skipped, sets that pointer to NULL. Returns true, unless the +// flag is set to an invalid value. 
+bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex, + const char* file, int line, + DeathTest** test) { + UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const int death_test_index = impl->current_test_info() + ->increment_death_test_count(); + + if (flag != NULL) { + if (death_test_index > flag->index()) { + DeathTest::set_last_death_test_message( + "Death test count (" + StreamableToString(death_test_index) + + ") somehow exceeded expected maximum (" + + StreamableToString(flag->index()) + ")"); + return false; + } + + if (!(flag->file() == file && flag->line() == line && + flag->index() == death_test_index)) { + *test = NULL; + return true; + } + } + +# if GTEST_OS_WINDOWS + + if (GTEST_FLAG(death_test_style) == "threadsafe" || + GTEST_FLAG(death_test_style) == "fast") { + *test = new WindowsDeathTest(statement, regex, file, line); + } + +# else + + if (GTEST_FLAG(death_test_style) == "threadsafe") { + *test = new ExecDeathTest(statement, regex, file, line); + } else if (GTEST_FLAG(death_test_style) == "fast") { + *test = new NoExecDeathTest(statement, regex); + } + +# endif // GTEST_OS_WINDOWS + + else { // NOLINT - this is more readable than unbalanced brackets inside #if. + DeathTest::set_last_death_test_message( + "Unknown death test style \"" + GTEST_FLAG(death_test_style) + + "\" encountered"); + return false; + } + + return true; +} + +# if GTEST_OS_WINDOWS +// Recreates the pipe and event handles from the provided parameters, +// signals the event, and returns a file descriptor wrapped around the pipe +// handle. This function is called in the child process only. +int GetStatusFileDescriptor(unsigned int parent_process_id, + size_t write_handle_as_size_t, + size_t event_handle_as_size_t) { + AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE, + FALSE, // Non-inheritable. + parent_process_id)); + if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) { + DeathTestAbort("Unable to open parent process " + + StreamableToString(parent_process_id)); + } + + // TODO(vladl@google.com): Replace the following check with a + // compile-time assertion when available. + GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t)); + + const HANDLE write_handle = + reinterpret_cast(write_handle_as_size_t); + HANDLE dup_write_handle; + + // The newly initialized handle is accessible only in in the parent + // process. To obtain one accessible within the child, we need to use + // DuplicateHandle. + if (!::DuplicateHandle(parent_process_handle.Get(), write_handle, + ::GetCurrentProcess(), &dup_write_handle, + 0x0, // Requested privileges ignored since + // DUPLICATE_SAME_ACCESS is used. + FALSE, // Request non-inheritable handler. 
+ DUPLICATE_SAME_ACCESS)) { + DeathTestAbort("Unable to duplicate the pipe handle " + + StreamableToString(write_handle_as_size_t) + + " from the parent process " + + StreamableToString(parent_process_id)); + } + + const HANDLE event_handle = reinterpret_cast(event_handle_as_size_t); + HANDLE dup_event_handle; + + if (!::DuplicateHandle(parent_process_handle.Get(), event_handle, + ::GetCurrentProcess(), &dup_event_handle, + 0x0, + FALSE, + DUPLICATE_SAME_ACCESS)) { + DeathTestAbort("Unable to duplicate the event handle " + + StreamableToString(event_handle_as_size_t) + + " from the parent process " + + StreamableToString(parent_process_id)); + } + + const int write_fd = + ::_open_osfhandle(reinterpret_cast(dup_write_handle), O_APPEND); + if (write_fd == -1) { + DeathTestAbort("Unable to convert pipe handle " + + StreamableToString(write_handle_as_size_t) + + " to a file descriptor"); + } + + // Signals the parent that the write end of the pipe has been acquired + // so the parent can release its own write end. + ::SetEvent(dup_event_handle); + + return write_fd; +} +# endif // GTEST_OS_WINDOWS + +// Returns a newly created InternalRunDeathTestFlag object with fields +// initialized from the GTEST_FLAG(internal_run_death_test) flag if +// the flag is specified; otherwise returns NULL. +InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() { + if (GTEST_FLAG(internal_run_death_test) == "") return NULL; + + // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we + // can use it here. + int line = -1; + int index = -1; + ::std::vector< ::std::string> fields; + SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields); + int write_fd = -1; + +# if GTEST_OS_WINDOWS + + unsigned int parent_process_id = 0; + size_t write_handle_as_size_t = 0; + size_t event_handle_as_size_t = 0; + + if (fields.size() != 6 + || !ParseNaturalNumber(fields[1], &line) + || !ParseNaturalNumber(fields[2], &index) + || !ParseNaturalNumber(fields[3], &parent_process_id) + || !ParseNaturalNumber(fields[4], &write_handle_as_size_t) + || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) { + DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + + GTEST_FLAG(internal_run_death_test)); + } + write_fd = GetStatusFileDescriptor(parent_process_id, + write_handle_as_size_t, + event_handle_as_size_t); +# else + + if (fields.size() != 4 + || !ParseNaturalNumber(fields[1], &line) + || !ParseNaturalNumber(fields[2], &index) + || !ParseNaturalNumber(fields[3], &write_fd)) { + DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + + GTEST_FLAG(internal_run_death_test)); + } + +# endif // GTEST_OS_WINDOWS + + return new InternalRunDeathTestFlag(fields[0], line, index, write_fd); +} + +} // namespace internal + +#endif // GTEST_HAS_DEATH_TEST + +} // namespace testing diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-filepath.cc b/libs/libvpx/third_party/googletest/src/src/gtest-filepath.cc new file mode 100644 index 0000000000..0292dc1195 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/src/gtest-filepath.cc @@ -0,0 +1,387 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: keith.ray@gmail.com (Keith Ray) + +#include "gtest/gtest-message.h" +#include "gtest/internal/gtest-filepath.h" +#include "gtest/internal/gtest-port.h" + +#include + +#if GTEST_OS_WINDOWS_MOBILE +# include +#elif GTEST_OS_WINDOWS +# include +# include +#elif GTEST_OS_SYMBIAN +// Symbian OpenC has PATH_MAX in sys/syslimits.h +# include +#else +# include +# include // Some Linux distributions define PATH_MAX here. +#endif // GTEST_OS_WINDOWS_MOBILE + +#if GTEST_OS_WINDOWS +# define GTEST_PATH_MAX_ _MAX_PATH +#elif defined(PATH_MAX) +# define GTEST_PATH_MAX_ PATH_MAX +#elif defined(_XOPEN_PATH_MAX) +# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX +#else +# define GTEST_PATH_MAX_ _POSIX_PATH_MAX +#endif // GTEST_OS_WINDOWS + +#include "gtest/internal/gtest-string.h" + +namespace testing { +namespace internal { + +#if GTEST_OS_WINDOWS +// On Windows, '\\' is the standard path separator, but many tools and the +// Windows API also accept '/' as an alternate path separator. Unless otherwise +// noted, a file path can contain either kind of path separators, or a mixture +// of them. +const char kPathSeparator = '\\'; +const char kAlternatePathSeparator = '/'; +const char kAlternatePathSeparatorString[] = "/"; +# if GTEST_OS_WINDOWS_MOBILE +// Windows CE doesn't have a current directory. You should not use +// the current directory in tests on Windows CE, but this at least +// provides a reasonable fallback. +const char kCurrentDirectoryString[] = "\\"; +// Windows CE doesn't define INVALID_FILE_ATTRIBUTES +const DWORD kInvalidFileAttributes = 0xffffffff; +# else +const char kCurrentDirectoryString[] = ".\\"; +# endif // GTEST_OS_WINDOWS_MOBILE +#else +const char kPathSeparator = '/'; +const char kCurrentDirectoryString[] = "./"; +#endif // GTEST_OS_WINDOWS + +// Returns whether the given character is a valid path separator. +static bool IsPathSeparator(char c) { +#if GTEST_HAS_ALT_PATH_SEP_ + return (c == kPathSeparator) || (c == kAlternatePathSeparator); +#else + return c == kPathSeparator; +#endif +} + +// Returns the current working directory, or "" if unsuccessful. +FilePath FilePath::GetCurrentDir() { +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT + // Windows CE doesn't have a current directory, so we just return + // something reasonable. 
+ return FilePath(kCurrentDirectoryString); +#elif GTEST_OS_WINDOWS + char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; + return FilePath(_getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd); +#else + char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; + char* result = getcwd(cwd, sizeof(cwd)); +# if GTEST_OS_NACL + // getcwd will likely fail in NaCl due to the sandbox, so return something + // reasonable. The user may have provided a shim implementation for getcwd, + // however, so fallback only when failure is detected. + return FilePath(result == NULL ? kCurrentDirectoryString : cwd); +# endif // GTEST_OS_NACL + return FilePath(result == NULL ? "" : cwd); +#endif // GTEST_OS_WINDOWS_MOBILE +} + +// Returns a copy of the FilePath with the case-insensitive extension removed. +// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns +// FilePath("dir/file"). If a case-insensitive extension is not +// found, returns a copy of the original FilePath. +FilePath FilePath::RemoveExtension(const char* extension) const { + const std::string dot_extension = std::string(".") + extension; + if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) { + return FilePath(pathname_.substr( + 0, pathname_.length() - dot_extension.length())); + } + return *this; +} + +// Returns a pointer to the last occurence of a valid path separator in +// the FilePath. On Windows, for example, both '/' and '\' are valid path +// separators. Returns NULL if no path separator was found. +const char* FilePath::FindLastPathSeparator() const { + const char* const last_sep = strrchr(c_str(), kPathSeparator); +#if GTEST_HAS_ALT_PATH_SEP_ + const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator); + // Comparing two pointers of which only one is NULL is undefined. + if (last_alt_sep != NULL && + (last_sep == NULL || last_alt_sep > last_sep)) { + return last_alt_sep; + } +#endif + return last_sep; +} + +// Returns a copy of the FilePath with the directory part removed. +// Example: FilePath("path/to/file").RemoveDirectoryName() returns +// FilePath("file"). If there is no directory part ("just_a_file"), it returns +// the FilePath unmodified. If there is no file part ("just_a_dir/") it +// returns an empty FilePath (""). +// On Windows platform, '\' is the path separator, otherwise it is '/'. +FilePath FilePath::RemoveDirectoryName() const { + const char* const last_sep = FindLastPathSeparator(); + return last_sep ? FilePath(last_sep + 1) : *this; +} + +// RemoveFileName returns the directory path with the filename removed. +// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". +// If the FilePath is "a_file" or "/a_file", RemoveFileName returns +// FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does +// not have a file, like "just/a/dir/", it returns the FilePath unmodified. +// On Windows platform, '\' is the path separator, otherwise it is '/'. +FilePath FilePath::RemoveFileName() const { + const char* const last_sep = FindLastPathSeparator(); + std::string dir; + if (last_sep) { + dir = std::string(c_str(), last_sep + 1 - c_str()); + } else { + dir = kCurrentDirectoryString; + } + return FilePath(dir); +} + +// Helper functions for naming files in a directory for xml output. + +// Given directory = "dir", base_name = "test", number = 0, +// extension = "xml", returns "dir/test.xml". If number is greater +// than zero (e.g., 12), returns "dir/test_12.xml". +// On Windows platform, uses \ as the separator rather than /. 
+FilePath FilePath::MakeFileName(const FilePath& directory, + const FilePath& base_name, + int number, + const char* extension) { + std::string file; + if (number == 0) { + file = base_name.string() + "." + extension; + } else { + file = base_name.string() + "_" + StreamableToString(number) + + "." + extension; + } + return ConcatPaths(directory, FilePath(file)); +} + +// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml". +// On Windows, uses \ as the separator rather than /. +FilePath FilePath::ConcatPaths(const FilePath& directory, + const FilePath& relative_path) { + if (directory.IsEmpty()) + return relative_path; + const FilePath dir(directory.RemoveTrailingPathSeparator()); + return FilePath(dir.string() + kPathSeparator + relative_path.string()); +} + +// Returns true if pathname describes something findable in the file-system, +// either a file, directory, or whatever. +bool FilePath::FileOrDirectoryExists() const { +#if GTEST_OS_WINDOWS_MOBILE + LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str()); + const DWORD attributes = GetFileAttributes(unicode); + delete [] unicode; + return attributes != kInvalidFileAttributes; +#else + posix::StatStruct file_stat; + return posix::Stat(pathname_.c_str(), &file_stat) == 0; +#endif // GTEST_OS_WINDOWS_MOBILE +} + +// Returns true if pathname describes a directory in the file-system +// that exists. +bool FilePath::DirectoryExists() const { + bool result = false; +#if GTEST_OS_WINDOWS + // Don't strip off trailing separator if path is a root directory on + // Windows (like "C:\\"). + const FilePath& path(IsRootDirectory() ? *this : + RemoveTrailingPathSeparator()); +#else + const FilePath& path(*this); +#endif + +#if GTEST_OS_WINDOWS_MOBILE + LPCWSTR unicode = String::AnsiToUtf16(path.c_str()); + const DWORD attributes = GetFileAttributes(unicode); + delete [] unicode; + if ((attributes != kInvalidFileAttributes) && + (attributes & FILE_ATTRIBUTE_DIRECTORY)) { + result = true; + } +#else + posix::StatStruct file_stat; + result = posix::Stat(path.c_str(), &file_stat) == 0 && + posix::IsDir(file_stat); +#endif // GTEST_OS_WINDOWS_MOBILE + + return result; +} + +// Returns true if pathname describes a root directory. (Windows has one +// root directory per disk drive.) +bool FilePath::IsRootDirectory() const { +#if GTEST_OS_WINDOWS + // TODO(wan@google.com): on Windows a network share like + // \\server\share can be a root directory, although it cannot be the + // current directory. Handle this properly. + return pathname_.length() == 3 && IsAbsolutePath(); +#else + return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]); +#endif +} + +// Returns true if pathname describes an absolute path. +bool FilePath::IsAbsolutePath() const { + const char* const name = pathname_.c_str(); +#if GTEST_OS_WINDOWS + return pathname_.length() >= 3 && + ((name[0] >= 'a' && name[0] <= 'z') || + (name[0] >= 'A' && name[0] <= 'Z')) && + name[1] == ':' && + IsPathSeparator(name[2]); +#else + return IsPathSeparator(name[0]); +#endif +} + +// Returns a pathname for a file that does not currently exist. The pathname +// will be directory/base_name.extension or +// directory/base_name_.extension if directory/base_name.extension +// already exists. The number will be incremented until a pathname is found +// that does not already exist. +// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. 
+// There could be a race condition if two or more processes are calling this +// function at the same time -- they could both pick the same filename. +FilePath FilePath::GenerateUniqueFileName(const FilePath& directory, + const FilePath& base_name, + const char* extension) { + FilePath full_pathname; + int number = 0; + do { + full_pathname.Set(MakeFileName(directory, base_name, number++, extension)); + } while (full_pathname.FileOrDirectoryExists()); + return full_pathname; +} + +// Returns true if FilePath ends with a path separator, which indicates that +// it is intended to represent a directory. Returns false otherwise. +// This does NOT check that a directory (or file) actually exists. +bool FilePath::IsDirectory() const { + return !pathname_.empty() && + IsPathSeparator(pathname_.c_str()[pathname_.length() - 1]); +} + +// Create directories so that path exists. Returns true if successful or if +// the directories already exist; returns false if unable to create directories +// for any reason. +bool FilePath::CreateDirectoriesRecursively() const { + if (!this->IsDirectory()) { + return false; + } + + if (pathname_.length() == 0 || this->DirectoryExists()) { + return true; + } + + const FilePath parent(this->RemoveTrailingPathSeparator().RemoveFileName()); + return parent.CreateDirectoriesRecursively() && this->CreateFolder(); +} + +// Create the directory so that path exists. Returns true if successful or +// if the directory already exists; returns false if unable to create the +// directory for any reason, including if the parent directory does not +// exist. Not named "CreateDirectory" because that's a macro on Windows. +bool FilePath::CreateFolder() const { +#if GTEST_OS_WINDOWS_MOBILE + FilePath removed_sep(this->RemoveTrailingPathSeparator()); + LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str()); + int result = CreateDirectory(unicode, NULL) ? 0 : -1; + delete [] unicode; +#elif GTEST_OS_WINDOWS + int result = _mkdir(pathname_.c_str()); +#else + int result = mkdir(pathname_.c_str(), 0777); +#endif // GTEST_OS_WINDOWS_MOBILE + + if (result == -1) { + return this->DirectoryExists(); // An error is OK if the directory exists. + } + return true; // No error. +} + +// If input name has a trailing separator character, remove it and return the +// name, otherwise return the name string unmodified. +// On Windows platform, uses \ as the separator, other platforms use /. +FilePath FilePath::RemoveTrailingPathSeparator() const { + return IsDirectory() + ? FilePath(pathname_.substr(0, pathname_.length() - 1)) + : *this; +} + +// Removes any redundant separators that might be in the pathname. +// For example, "bar///foo" becomes "bar/foo". Does not eliminate other +// redundancies that might be in a pathname involving "." or "..". +// TODO(wan@google.com): handle Windows network shares (e.g. \\server\share). 
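FilePath::Normalize(), defined next, removes redundant separators in the way described above ("bar///foo" becomes "bar/foo") while leaving "." and ".." components alone. A tiny standalone illustration of that collapsing on a plain std::string (assuming '/' as the only separator; not the library's code):

#include <cassert>
#include <string>

// Collapses runs of '/' into a single separator; every other character,
// including "." and ".." components, passes through untouched.
static std::string CollapseSeparators(const std::string& path) {
  std::string out;
  for (std::string::size_type i = 0; i < path.size(); ++i) {
    if (path[i] == '/' && !out.empty() && out[out.size() - 1] == '/') {
      continue;  // Skip the redundant separator.
    }
    out += path[i];
  }
  return out;
}

int main() {
  assert(CollapseSeparators("bar///foo") == "bar/foo");
  assert(CollapseSeparators("a//b/./c") == "a/b/./c");
  return 0;
}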
+void FilePath::Normalize() { + if (pathname_.c_str() == NULL) { + pathname_ = ""; + return; + } + const char* src = pathname_.c_str(); + char* const dest = new char[pathname_.length() + 1]; + char* dest_ptr = dest; + memset(dest_ptr, 0, pathname_.length() + 1); + + while (*src != '\0') { + *dest_ptr = *src; + if (!IsPathSeparator(*src)) { + src++; + } else { +#if GTEST_HAS_ALT_PATH_SEP_ + if (*dest_ptr == kAlternatePathSeparator) { + *dest_ptr = kPathSeparator; + } +#endif + while (IsPathSeparator(*src)) + src++; + } + dest_ptr++; + } + *dest_ptr = '\0'; + pathname_ = dest; + delete[] dest; +} + +} // namespace internal +} // namespace testing diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-internal-inl.h b/libs/libvpx/third_party/googletest/src/src/gtest-internal-inl.h new file mode 100644 index 0000000000..ed8a682a96 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/src/gtest-internal-inl.h @@ -0,0 +1,1183 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Utility functions and classes used by the Google C++ testing framework. +// +// Author: wan@google.com (Zhanyong Wan) +// +// This file contains purely Google Test's internal implementation. Please +// DO NOT #INCLUDE IT IN A USER PROGRAM. + +#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_ +#define GTEST_SRC_GTEST_INTERNAL_INL_H_ + +// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is +// part of Google Test's implementation; otherwise it's undefined. +#if !GTEST_IMPLEMENTATION_ +// If this file is included from the user's code, just say no. +# error "gtest-internal-inl.h is part of Google Test's internal implementation." +# error "It must not be included except by Google Test itself." +#endif // GTEST_IMPLEMENTATION_ + +#ifndef _WIN32_WCE +# include +#endif // !_WIN32_WCE +#include +#include // For strtoll/_strtoul64/malloc/free. +#include // For memmove. 
+ +#include +#include +#include + +#include "gtest/internal/gtest-port.h" + +#if GTEST_CAN_STREAM_RESULTS_ +# include // NOLINT +# include // NOLINT +#endif + +#if GTEST_OS_WINDOWS +# include // NOLINT +#endif // GTEST_OS_WINDOWS + +#include "gtest/gtest.h" // NOLINT +#include "gtest/gtest-spi.h" + +namespace testing { + +// Declares the flags. +// +// We don't want the users to modify this flag in the code, but want +// Google Test's own unit tests to be able to access it. Therefore we +// declare it here as opposed to in gtest.h. +GTEST_DECLARE_bool_(death_test_use_fork); + +namespace internal { + +// The value of GetTestTypeId() as seen from within the Google Test +// library. This is solely for testing GetTestTypeId(). +GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest; + +// Names of the flags (needed for parsing Google Test flags). +const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests"; +const char kBreakOnFailureFlag[] = "break_on_failure"; +const char kCatchExceptionsFlag[] = "catch_exceptions"; +const char kColorFlag[] = "color"; +const char kFilterFlag[] = "filter"; +const char kListTestsFlag[] = "list_tests"; +const char kOutputFlag[] = "output"; +const char kPrintTimeFlag[] = "print_time"; +const char kRandomSeedFlag[] = "random_seed"; +const char kRepeatFlag[] = "repeat"; +const char kShuffleFlag[] = "shuffle"; +const char kStackTraceDepthFlag[] = "stack_trace_depth"; +const char kStreamResultToFlag[] = "stream_result_to"; +const char kThrowOnFailureFlag[] = "throw_on_failure"; +const char kFlagfileFlag[] = "flagfile"; + +// A valid random seed must be in [1, kMaxRandomSeed]. +const int kMaxRandomSeed = 99999; + +// g_help_flag is true iff the --help flag or an equivalent form is +// specified on the command line. +GTEST_API_ extern bool g_help_flag; + +// Returns the current time in milliseconds. +GTEST_API_ TimeInMillis GetTimeInMillis(); + +// Returns true iff Google Test should use colors in the output. +GTEST_API_ bool ShouldUseColor(bool stdout_is_tty); + +// Formats the given time in milliseconds as seconds. +GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms); + +// Converts the given time in milliseconds to a date string in the ISO 8601 +// format, without the timezone information. N.B.: due to the use the +// non-reentrant localtime() function, this function is not thread safe. Do +// not use it in any code that can be called from multiple threads. +GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms); + +// Parses a string for an Int32 flag, in the form of "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +GTEST_API_ bool ParseInt32Flag( + const char* str, const char* flag, Int32* value); + +// Returns a random seed in range [1, kMaxRandomSeed] based on the +// given --gtest_random_seed flag value. +inline int GetRandomSeedFromFlag(Int32 random_seed_flag) { + const unsigned int raw_seed = (random_seed_flag == 0) ? + static_cast(GetTimeInMillis()) : + static_cast(random_seed_flag); + + // Normalizes the actual seed to range [1, kMaxRandomSeed] such that + // it's easy to type. + const int normalized_seed = + static_cast((raw_seed - 1U) % + static_cast(kMaxRandomSeed)) + 1; + return normalized_seed; +} + +// Returns the first valid random seed after 'seed'. The behavior is +// undefined if 'seed' is invalid. The seed after kMaxRandomSeed is +// considered to be 1. 
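GetRandomSeedFromFlag() above folds an arbitrary --gtest_random_seed value into the valid range [1, kMaxRandomSeed], and GetNextRandomSeed(), defined next, wraps around the same bound. A small standalone sketch (not part of the patch) of that normalization arithmetic for a few sample inputs:

#include <cstdio>

int main() {
  const int kMaxRandomSeed = 99999;
  const unsigned int samples[] = { 1u, 2u, 99999u, 100000u, 100001u };
  for (int i = 0; i < 5; ++i) {
    const unsigned int raw_seed = samples[i];
    // Same expression as GetRandomSeedFromFlag(): shift into [0, 99998],
    // wrap with %, then shift back into [1, 99999].
    const int normalized_seed =
        static_cast<int>((raw_seed - 1U) %
                         static_cast<unsigned int>(kMaxRandomSeed)) + 1;
    std::printf("%u -> %d\n", raw_seed, normalized_seed);
    // Prints: 1 -> 1, 2 -> 2, 99999 -> 99999, 100000 -> 1, 100001 -> 2
  }
  return 0;
}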
+inline int GetNextRandomSeed(int seed) { + GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed) + << "Invalid random seed " << seed << " - must be in [1, " + << kMaxRandomSeed << "]."; + const int next_seed = seed + 1; + return (next_seed > kMaxRandomSeed) ? 1 : next_seed; +} + +// This class saves the values of all Google Test flags in its c'tor, and +// restores them in its d'tor. +class GTestFlagSaver { + public: + // The c'tor. + GTestFlagSaver() { + also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests); + break_on_failure_ = GTEST_FLAG(break_on_failure); + catch_exceptions_ = GTEST_FLAG(catch_exceptions); + color_ = GTEST_FLAG(color); + death_test_style_ = GTEST_FLAG(death_test_style); + death_test_use_fork_ = GTEST_FLAG(death_test_use_fork); + filter_ = GTEST_FLAG(filter); + internal_run_death_test_ = GTEST_FLAG(internal_run_death_test); + list_tests_ = GTEST_FLAG(list_tests); + output_ = GTEST_FLAG(output); + print_time_ = GTEST_FLAG(print_time); + random_seed_ = GTEST_FLAG(random_seed); + repeat_ = GTEST_FLAG(repeat); + shuffle_ = GTEST_FLAG(shuffle); + stack_trace_depth_ = GTEST_FLAG(stack_trace_depth); + stream_result_to_ = GTEST_FLAG(stream_result_to); + throw_on_failure_ = GTEST_FLAG(throw_on_failure); + } + + // The d'tor is not virtual. DO NOT INHERIT FROM THIS CLASS. + ~GTestFlagSaver() { + GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_; + GTEST_FLAG(break_on_failure) = break_on_failure_; + GTEST_FLAG(catch_exceptions) = catch_exceptions_; + GTEST_FLAG(color) = color_; + GTEST_FLAG(death_test_style) = death_test_style_; + GTEST_FLAG(death_test_use_fork) = death_test_use_fork_; + GTEST_FLAG(filter) = filter_; + GTEST_FLAG(internal_run_death_test) = internal_run_death_test_; + GTEST_FLAG(list_tests) = list_tests_; + GTEST_FLAG(output) = output_; + GTEST_FLAG(print_time) = print_time_; + GTEST_FLAG(random_seed) = random_seed_; + GTEST_FLAG(repeat) = repeat_; + GTEST_FLAG(shuffle) = shuffle_; + GTEST_FLAG(stack_trace_depth) = stack_trace_depth_; + GTEST_FLAG(stream_result_to) = stream_result_to_; + GTEST_FLAG(throw_on_failure) = throw_on_failure_; + } + + private: + // Fields for saving the original values of flags. + bool also_run_disabled_tests_; + bool break_on_failure_; + bool catch_exceptions_; + std::string color_; + std::string death_test_style_; + bool death_test_use_fork_; + std::string filter_; + std::string internal_run_death_test_; + bool list_tests_; + std::string output_; + bool print_time_; + internal::Int32 random_seed_; + internal::Int32 repeat_; + bool shuffle_; + internal::Int32 stack_trace_depth_; + std::string stream_result_to_; + bool throw_on_failure_; +} GTEST_ATTRIBUTE_UNUSED_; + +// Converts a Unicode code point to a narrow string in UTF-8 encoding. +// code_point parameter is of type UInt32 because wchar_t may not be +// wide enough to contain a code point. +// If the code_point is not a valid Unicode code point +// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted +// to "(Invalid Unicode 0xXXXXXXXX)". +GTEST_API_ std::string CodePointToUtf8(UInt32 code_point); + +// Converts a wide string to a narrow string in UTF-8 encoding. +// The wide string is assumed to have the following encoding: +// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) +// UTF-32 if sizeof(wchar_t) == 4 (on Linux) +// Parameter str points to a null-terminated wide string. +// Parameter num_chars may additionally limit the number +// of wchar_t characters processed. -1 is used when the entire string +// should be processed. 
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from Basic Normal Plane.
+GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars);
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded();
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (e.g., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+GTEST_API_ bool ShouldShard(const char* total_shards_str,
+                            const char* shard_index_str,
+                            bool in_subprocess_for_death_test);
+
+// Parses the environment variable var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error
+// and aborts.
+GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val);
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true iff the test should be run on this shard. The test id is
+// some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
+GTEST_API_ bool ShouldRunTestOnShard(
+    int total_shards, int shard_index, int test_id);
+
+// STL container utilities.
+
+// Returns the number of elements in the given container that satisfy
+// the given predicate.
+template <class Container, typename Predicate>
+inline int CountIf(const Container& c, Predicate predicate) {
+  // Implemented as an explicit loop since std::count_if() in libCstd on
+  // Solaris has a non-standard signature.
+  int count = 0;
+  for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
+    if (predicate(*it))
+      ++count;
+  }
+  return count;
+}
+
+// Applies a function/functor to each element in the container.
+template <class Container, typename Functor>
+void ForEach(const Container& c, Functor functor) {
+  std::for_each(c.begin(), c.end(), functor);
+}
+
+// Returns the i-th element of the vector, or default_value if i is not
+// in range [0, v.size()).
+template <typename E>
+inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {
+  return (i < 0 || i >= static_cast<int>(v.size())) ? default_value : v[i];
+}
+
+// Performs an in-place shuffle of a range of the vector's elements.
+// 'begin' and 'end' are element indices as an STL-style range;
+// i.e. [begin, end) are shuffled, where 'end' == size() means to
+// shuffle to the end of the vector.
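ShuffleRange(), defined next, performs this with a Fisher–Yates pass driven by Google Test's internal::Random. A rough standalone equivalent over a std::vector<int> using <random> (illustrative only, not the library's implementation):

#include <cstdio>
#include <random>
#include <utility>
#include <vector>

// Shuffles v[begin, end) in place: walk the range back to front and swap
// each slot with a uniformly chosen slot at or before it.
static void ShuffleIndexRange(std::mt19937* rng, int begin, int end,
                              std::vector<int>* v) {
  for (int range_width = end - begin; range_width >= 2; --range_width) {
    const int last_in_range = begin + range_width - 1;
    std::uniform_int_distribution<int> pick(0, range_width - 1);
    const int selected = begin + pick(*rng);
    std::swap((*v)[selected], (*v)[last_in_range]);
  }
}

int main() {
  std::vector<int> v;
  for (int i = 0; i < 10; ++i) v.push_back(i);
  std::mt19937 rng(12345);
  ShuffleIndexRange(&rng, 2, 8, &v);  // Elements 0, 1, 8, 9 keep their places.
  for (int i = 0; i < 10; ++i) std::printf("%d ", v[i]);
  std::printf("\n");
  return 0;
}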
+template +void ShuffleRange(internal::Random* random, int begin, int end, + std::vector* v) { + const int size = static_cast(v->size()); + GTEST_CHECK_(0 <= begin && begin <= size) + << "Invalid shuffle range start " << begin << ": must be in range [0, " + << size << "]."; + GTEST_CHECK_(begin <= end && end <= size) + << "Invalid shuffle range finish " << end << ": must be in range [" + << begin << ", " << size << "]."; + + // Fisher-Yates shuffle, from + // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle + for (int range_width = end - begin; range_width >= 2; range_width--) { + const int last_in_range = begin + range_width - 1; + const int selected = begin + random->Generate(range_width); + std::swap((*v)[selected], (*v)[last_in_range]); + } +} + +// Performs an in-place shuffle of the vector's elements. +template +inline void Shuffle(internal::Random* random, std::vector* v) { + ShuffleRange(random, 0, static_cast(v->size()), v); +} + +// A function for deleting an object. Handy for being used as a +// functor. +template +static void Delete(T* x) { + delete x; +} + +// A predicate that checks the key of a TestProperty against a known key. +// +// TestPropertyKeyIs is copyable. +class TestPropertyKeyIs { + public: + // Constructor. + // + // TestPropertyKeyIs has NO default constructor. + explicit TestPropertyKeyIs(const std::string& key) : key_(key) {} + + // Returns true iff the test name of test property matches on key_. + bool operator()(const TestProperty& test_property) const { + return test_property.key() == key_; + } + + private: + std::string key_; +}; + +// Class UnitTestOptions. +// +// This class contains functions for processing options the user +// specifies when running the tests. It has only static members. +// +// In most cases, the user can specify an option using either an +// environment variable or a command line flag. E.g. you can set the +// test filter using either GTEST_FILTER or --gtest_filter. If both +// the variable and the flag are present, the latter overrides the +// former. +class GTEST_API_ UnitTestOptions { + public: + // Functions for processing the gtest_output flag. + + // Returns the output format, or "" for normal printed output. + static std::string GetOutputFormat(); + + // Returns the absolute path of the requested output file, or the + // default (test_detail.xml in the original working directory) if + // none was explicitly specified. + static std::string GetAbsolutePathToOutputFile(); + + // Functions for processing the gtest_filter flag. + + // Returns true iff the wildcard pattern matches the string. The + // first ':' or '\0' character in pattern marks the end of it. + // + // This recursive algorithm isn't very efficient, but is clear and + // works well enough for matching test names, which are short. + static bool PatternMatchesString(const char *pattern, const char *str); + + // Returns true iff the user-specified filter matches the test case + // name and the test name. + static bool FilterMatchesTest(const std::string &test_case_name, + const std::string &test_name); + +#if GTEST_OS_WINDOWS + // Function for supporting the gtest_catch_exception flag. + + // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the + // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. + // This function is useful as an __except condition. + static int GTestShouldProcessSEH(DWORD exception_code); +#endif // GTEST_OS_WINDOWS + + // Returns true if "name" matches the ':' separated list of glob-style + // filters in "filter". 
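MatchesFilter(), declared next, checks a test's full name against the ':'-separated glob patterns that PatternMatchesString() evaluates one at a time: '*' matches any substring, '?' matches any single character, and the first ':' (or the end of the string) terminates each pattern. A rough standalone sketch of that matching (not Google Test's code):

#include <cassert>
#include <cstring>
#include <string>

// Returns true if the glob pattern (terminated by ':' or '\0') matches str.
static bool PatternMatches(const char* pattern, const char* str) {
  switch (*pattern) {
    case '\0':
    case ':':
      return *str == '\0';
    case '?':  // Matches any single character.
      return *str != '\0' && PatternMatches(pattern + 1, str + 1);
    case '*':  // Matches any substring, possibly empty.
      return (*str != '\0' && PatternMatches(pattern, str + 1)) ||
             PatternMatches(pattern + 1, str);
    default:   // Ordinary characters must match exactly.
      return *pattern == *str && PatternMatches(pattern + 1, str + 1);
  }
}

// Returns true if name matches any of the ':'-separated patterns in filter.
static bool MatchesAnyPattern(const std::string& name, const char* filter) {
  for (const char* cur = filter; cur != NULL;) {
    if (PatternMatches(cur, name.c_str())) return true;
    cur = strchr(cur, ':');
    if (cur != NULL) ++cur;  // Step past the ':' and try the next pattern.
  }
  return false;
}

int main() {
  assert(MatchesAnyPattern("FooTest.Bar", "FooTest.*"));
  assert(MatchesAnyPattern("BazTest.Qux", "FooTest.*:BazTest.Qux"));
  assert(!MatchesAnyPattern("OtherTest.Case", "FooTest.*:BazTest.Qux"));
  return 0;
}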
+ static bool MatchesFilter(const std::string& name, const char* filter); +}; + +// Returns the current application's name, removing directory path if that +// is present. Used by UnitTestOptions::GetOutputFile. +GTEST_API_ FilePath GetCurrentExecutableName(); + +// The role interface for getting the OS stack trace as a string. +class OsStackTraceGetterInterface { + public: + OsStackTraceGetterInterface() {} + virtual ~OsStackTraceGetterInterface() {} + + // Returns the current OS stack trace as an std::string. Parameters: + // + // max_depth - the maximum number of stack frames to be included + // in the trace. + // skip_count - the number of top frames to be skipped; doesn't count + // against max_depth. + virtual string CurrentStackTrace(int max_depth, int skip_count) = 0; + + // UponLeavingGTest() should be called immediately before Google Test calls + // user code. It saves some information about the current stack that + // CurrentStackTrace() will use to find and hide Google Test stack frames. + virtual void UponLeavingGTest() = 0; + + // This string is inserted in place of stack frames that are part of + // Google Test's implementation. + static const char* const kElidedFramesMarker; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface); +}; + +// A working implementation of the OsStackTraceGetterInterface interface. +class OsStackTraceGetter : public OsStackTraceGetterInterface { + public: + OsStackTraceGetter() {} + + virtual string CurrentStackTrace(int max_depth, int skip_count); + virtual void UponLeavingGTest(); + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter); +}; + +// Information about a Google Test trace point. +struct TraceInfo { + const char* file; + int line; + std::string message; +}; + +// This is the default global test part result reporter used in UnitTestImpl. +// This class should only be used by UnitTestImpl. +class DefaultGlobalTestPartResultReporter + : public TestPartResultReporterInterface { + public: + explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test); + // Implements the TestPartResultReporterInterface. Reports the test part + // result in the current test. + virtual void ReportTestPartResult(const TestPartResult& result); + + private: + UnitTestImpl* const unit_test_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter); +}; + +// This is the default per thread test part result reporter used in +// UnitTestImpl. This class should only be used by UnitTestImpl. +class DefaultPerThreadTestPartResultReporter + : public TestPartResultReporterInterface { + public: + explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test); + // Implements the TestPartResultReporterInterface. The implementation just + // delegates to the current global test part result reporter of *unit_test_. + virtual void ReportTestPartResult(const TestPartResult& result); + + private: + UnitTestImpl* const unit_test_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter); +}; + +// The private implementation of the UnitTest class. We don't protect +// the methods under a mutex, as this class is not accessible by a +// user and the UnitTest class that delegates work to this class does +// proper locking. +class GTEST_API_ UnitTestImpl { + public: + explicit UnitTestImpl(UnitTest* parent); + virtual ~UnitTestImpl(); + + // There are two different ways to register your own TestPartResultReporter. 
+ // You can register your own repoter to listen either only for test results + // from the current thread or for results from all threads. + // By default, each per-thread test result repoter just passes a new + // TestPartResult to the global test result reporter, which registers the + // test part result for the currently running test. + + // Returns the global test part result reporter. + TestPartResultReporterInterface* GetGlobalTestPartResultReporter(); + + // Sets the global test part result reporter. + void SetGlobalTestPartResultReporter( + TestPartResultReporterInterface* reporter); + + // Returns the test part result reporter for the current thread. + TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread(); + + // Sets the test part result reporter for the current thread. + void SetTestPartResultReporterForCurrentThread( + TestPartResultReporterInterface* reporter); + + // Gets the number of successful test cases. + int successful_test_case_count() const; + + // Gets the number of failed test cases. + int failed_test_case_count() const; + + // Gets the number of all test cases. + int total_test_case_count() const; + + // Gets the number of all test cases that contain at least one test + // that should run. + int test_case_to_run_count() const; + + // Gets the number of successful tests. + int successful_test_count() const; + + // Gets the number of failed tests. + int failed_test_count() const; + + // Gets the number of disabled tests that will be reported in the XML report. + int reportable_disabled_test_count() const; + + // Gets the number of disabled tests. + int disabled_test_count() const; + + // Gets the number of tests to be printed in the XML report. + int reportable_test_count() const; + + // Gets the number of all tests. + int total_test_count() const; + + // Gets the number of tests that should run. + int test_to_run_count() const; + + // Gets the time of the test program start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp() const { return start_timestamp_; } + + // Gets the elapsed time, in milliseconds. + TimeInMillis elapsed_time() const { return elapsed_time_; } + + // Returns true iff the unit test passed (i.e. all test cases passed). + bool Passed() const { return !Failed(); } + + // Returns true iff the unit test failed (i.e. some test case failed + // or something outside of all tests failed). + bool Failed() const { + return failed_test_case_count() > 0 || ad_hoc_test_result()->Failed(); + } + + // Gets the i-th test case among all the test cases. i can range from 0 to + // total_test_case_count() - 1. If i is not in that range, returns NULL. + const TestCase* GetTestCase(int i) const { + const int index = GetElementOr(test_case_indices_, i, -1); + return index < 0 ? NULL : test_cases_[i]; + } + + // Gets the i-th test case among all the test cases. i can range from 0 to + // total_test_case_count() - 1. If i is not in that range, returns NULL. + TestCase* GetMutableTestCase(int i) { + const int index = GetElementOr(test_case_indices_, i, -1); + return index < 0 ? NULL : test_cases_[index]; + } + + // Provides access to the event listener list. + TestEventListeners* listeners() { return &listeners_; } + + // Returns the TestResult for the test that's currently running, or + // the TestResult for the ad hoc test if no test is running. + TestResult* current_test_result(); + + // Returns the TestResult for the ad hoc test. 
+ const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; } + + // Sets the OS stack trace getter. + // + // Does nothing if the input and the current OS stack trace getter + // are the same; otherwise, deletes the old getter and makes the + // input the current getter. + void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter); + + // Returns the current OS stack trace getter if it is not NULL; + // otherwise, creates an OsStackTraceGetter, makes it the current + // getter, and returns it. + OsStackTraceGetterInterface* os_stack_trace_getter(); + + // Returns the current OS stack trace as an std::string. + // + // The maximum number of stack frames to be included is specified by + // the gtest_stack_trace_depth flag. The skip_count parameter + // specifies the number of top frames to be skipped, which doesn't + // count against the number of frames to be included. + // + // For example, if Foo() calls Bar(), which in turn calls + // CurrentOsStackTraceExceptTop(1), Foo() will be included in the + // trace but Bar() and CurrentOsStackTraceExceptTop() won't. + std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_; + + // Finds and returns a TestCase with the given name. If one doesn't + // exist, creates one and returns it. + // + // Arguments: + // + // test_case_name: name of the test case + // type_param: the name of the test's type parameter, or NULL if + // this is not a typed or a type-parameterized test. + // set_up_tc: pointer to the function that sets up the test case + // tear_down_tc: pointer to the function that tears down the test case + TestCase* GetTestCase(const char* test_case_name, + const char* type_param, + Test::SetUpTestCaseFunc set_up_tc, + Test::TearDownTestCaseFunc tear_down_tc); + + // Adds a TestInfo to the unit test. + // + // Arguments: + // + // set_up_tc: pointer to the function that sets up the test case + // tear_down_tc: pointer to the function that tears down the test case + // test_info: the TestInfo object + void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc, + Test::TearDownTestCaseFunc tear_down_tc, + TestInfo* test_info) { + // In order to support thread-safe death tests, we need to + // remember the original working directory when the test program + // was first invoked. We cannot do this in RUN_ALL_TESTS(), as + // the user may have changed the current directory before calling + // RUN_ALL_TESTS(). Therefore we capture the current directory in + // AddTestInfo(), which is called to register a TEST or TEST_F + // before main() is reached. + if (original_working_dir_.IsEmpty()) { + original_working_dir_.Set(FilePath::GetCurrentDir()); + GTEST_CHECK_(!original_working_dir_.IsEmpty()) + << "Failed to get the current working directory."; + } + + GetTestCase(test_info->test_case_name(), + test_info->type_param(), + set_up_tc, + tear_down_tc)->AddTestInfo(test_info); + } + +#if GTEST_HAS_PARAM_TEST + // Returns ParameterizedTestCaseRegistry object used to keep track of + // value-parameterized tests and instantiate and register them. + internal::ParameterizedTestCaseRegistry& parameterized_test_registry() { + return parameterized_test_registry_; + } +#endif // GTEST_HAS_PARAM_TEST + + // Sets the TestCase object for the test that's currently running. + void set_current_test_case(TestCase* a_current_test_case) { + current_test_case_ = a_current_test_case; + } + + // Sets the TestInfo object for the test that's currently running. 
If + // current_test_info is NULL, the assertion results will be stored in + // ad_hoc_test_result_. + void set_current_test_info(TestInfo* a_current_test_info) { + current_test_info_ = a_current_test_info; + } + + // Registers all parameterized tests defined using TEST_P and + // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter + // combination. This method can be called more then once; it has guards + // protecting from registering the tests more then once. If + // value-parameterized tests are disabled, RegisterParameterizedTests is + // present but does nothing. + void RegisterParameterizedTests(); + + // Runs all tests in this UnitTest object, prints the result, and + // returns true if all tests are successful. If any exception is + // thrown during a test, this test is considered to be failed, but + // the rest of the tests will still be run. + bool RunAllTests(); + + // Clears the results of all tests, except the ad hoc tests. + void ClearNonAdHocTestResult() { + ForEach(test_cases_, TestCase::ClearTestCaseResult); + } + + // Clears the results of ad-hoc test assertions. + void ClearAdHocTestResult() { + ad_hoc_test_result_.Clear(); + } + + // Adds a TestProperty to the current TestResult object when invoked in a + // context of a test or a test case, or to the global property set. If the + // result already contains a property with the same key, the value will be + // updated. + void RecordProperty(const TestProperty& test_property); + + enum ReactionToSharding { + HONOR_SHARDING_PROTOCOL, + IGNORE_SHARDING_PROTOCOL + }; + + // Matches the full name of each test against the user-specified + // filter to decide whether the test should run, then records the + // result in each TestCase and TestInfo object. + // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests + // based on sharding variables in the environment. + // Returns the number of tests that should run. + int FilterTests(ReactionToSharding shard_tests); + + // Prints the names of the tests matching the user-specified filter flag. + void ListTestsMatchingFilter(); + + const TestCase* current_test_case() const { return current_test_case_; } + TestInfo* current_test_info() { return current_test_info_; } + const TestInfo* current_test_info() const { return current_test_info_; } + + // Returns the vector of environments that need to be set-up/torn-down + // before/after the tests are run. + std::vector& environments() { return environments_; } + + // Getters for the per-thread Google Test trace stack. + std::vector& gtest_trace_stack() { + return *(gtest_trace_stack_.pointer()); + } + const std::vector& gtest_trace_stack() const { + return gtest_trace_stack_.get(); + } + +#if GTEST_HAS_DEATH_TEST + void InitDeathTestSubprocessControlInfo() { + internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag()); + } + // Returns a pointer to the parsed --gtest_internal_run_death_test + // flag, or NULL if that flag was not specified. + // This information is useful only in a death test child process. + // Must not be called before a call to InitGoogleTest. + const InternalRunDeathTestFlag* internal_run_death_test_flag() const { + return internal_run_death_test_flag_.get(); + } + + // Returns a pointer to the current death test factory. 
+ internal::DeathTestFactory* death_test_factory() { + return death_test_factory_.get(); + } + + void SuppressTestEventsIfInSubprocess(); + + friend class ReplaceDeathTestFactory; +#endif // GTEST_HAS_DEATH_TEST + + // Initializes the event listener performing XML output as specified by + // UnitTestOptions. Must not be called before InitGoogleTest. + void ConfigureXmlOutput(); + +#if GTEST_CAN_STREAM_RESULTS_ + // Initializes the event listener for streaming test results to a socket. + // Must not be called before InitGoogleTest. + void ConfigureStreamingOutput(); +#endif + + // Performs initialization dependent upon flag values obtained in + // ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to + // ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest + // this function is also called from RunAllTests. Since this function can be + // called more than once, it has to be idempotent. + void PostFlagParsingInit(); + + // Gets the random seed used at the start of the current test iteration. + int random_seed() const { return random_seed_; } + + // Gets the random number generator. + internal::Random* random() { return &random_; } + + // Shuffles all test cases, and the tests within each test case, + // making sure that death tests are still run first. + void ShuffleTests(); + + // Restores the test cases and tests to their order before the first shuffle. + void UnshuffleTests(); + + // Returns the value of GTEST_FLAG(catch_exceptions) at the moment + // UnitTest::Run() starts. + bool catch_exceptions() const { return catch_exceptions_; } + + private: + friend class ::testing::UnitTest; + + // Used by UnitTest::Run() to capture the state of + // GTEST_FLAG(catch_exceptions) at the moment it starts. + void set_catch_exceptions(bool value) { catch_exceptions_ = value; } + + // The UnitTest object that owns this implementation object. + UnitTest* const parent_; + + // The working directory when the first TEST() or TEST_F() was + // executed. + internal::FilePath original_working_dir_; + + // The default test part result reporters. + DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_; + DefaultPerThreadTestPartResultReporter + default_per_thread_test_part_result_reporter_; + + // Points to (but doesn't own) the global test part result reporter. + TestPartResultReporterInterface* global_test_part_result_repoter_; + + // Protects read and write access to global_test_part_result_reporter_. + internal::Mutex global_test_part_result_reporter_mutex_; + + // Points to (but doesn't own) the per-thread test part result reporter. + internal::ThreadLocal + per_thread_test_part_result_reporter_; + + // The vector of environments that need to be set-up/torn-down + // before/after the tests are run. + std::vector environments_; + + // The vector of TestCases in their original order. It owns the + // elements in the vector. + std::vector test_cases_; + + // Provides a level of indirection for the test case list to allow + // easy shuffling and restoring the test case order. The i-th + // element of this vector is the index of the i-th test case in the + // shuffled order. + std::vector test_case_indices_; + +#if GTEST_HAS_PARAM_TEST + // ParameterizedTestRegistry object used to register value-parameterized + // tests. + internal::ParameterizedTestCaseRegistry parameterized_test_registry_; + + // Indicates whether RegisterParameterizedTests() has been called already. 
+ bool parameterized_tests_registered_; +#endif // GTEST_HAS_PARAM_TEST + + // Index of the last death test case registered. Initially -1. + int last_death_test_case_; + + // This points to the TestCase for the currently running test. It + // changes as Google Test goes through one test case after another. + // When no test is running, this is set to NULL and Google Test + // stores assertion results in ad_hoc_test_result_. Initially NULL. + TestCase* current_test_case_; + + // This points to the TestInfo for the currently running test. It + // changes as Google Test goes through one test after another. When + // no test is running, this is set to NULL and Google Test stores + // assertion results in ad_hoc_test_result_. Initially NULL. + TestInfo* current_test_info_; + + // Normally, a user only writes assertions inside a TEST or TEST_F, + // or inside a function called by a TEST or TEST_F. Since Google + // Test keeps track of which test is current running, it can + // associate such an assertion with the test it belongs to. + // + // If an assertion is encountered when no TEST or TEST_F is running, + // Google Test attributes the assertion result to an imaginary "ad hoc" + // test, and records the result in ad_hoc_test_result_. + TestResult ad_hoc_test_result_; + + // The list of event listeners that can be used to track events inside + // Google Test. + TestEventListeners listeners_; + + // The OS stack trace getter. Will be deleted when the UnitTest + // object is destructed. By default, an OsStackTraceGetter is used, + // but the user can set this field to use a custom getter if that is + // desired. + OsStackTraceGetterInterface* os_stack_trace_getter_; + + // True iff PostFlagParsingInit() has been called. + bool post_flag_parse_init_performed_; + + // The random number seed used at the beginning of the test run. + int random_seed_; + + // Our random number generator. + internal::Random random_; + + // The time of the test program start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp_; + + // How long the test took to run, in milliseconds. + TimeInMillis elapsed_time_; + +#if GTEST_HAS_DEATH_TEST + // The decomposed components of the gtest_internal_run_death_test flag, + // parsed when RUN_ALL_TESTS is called. + internal::scoped_ptr internal_run_death_test_flag_; + internal::scoped_ptr death_test_factory_; +#endif // GTEST_HAS_DEATH_TEST + + // A per-thread stack of traces created by the SCOPED_TRACE() macro. + internal::ThreadLocal > gtest_trace_stack_; + + // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests() + // starts. + bool catch_exceptions_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl); +}; // class UnitTestImpl + +// Convenience function for accessing the global UnitTest +// implementation object. +inline UnitTestImpl* GetUnitTestImpl() { + return UnitTest::GetInstance()->impl(); +} + +#if GTEST_USES_SIMPLE_RE + +// Internal helper functions for implementing the simple regular +// expression matcher. 
+GTEST_API_ bool IsInSet(char ch, const char* str);
+GTEST_API_ bool IsAsciiDigit(char ch);
+GTEST_API_ bool IsAsciiPunct(char ch);
+GTEST_API_ bool IsRepeat(char ch);
+GTEST_API_ bool IsAsciiWhiteSpace(char ch);
+GTEST_API_ bool IsAsciiWordChar(char ch);
+GTEST_API_ bool IsValidEscape(char ch);
+GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
+GTEST_API_ bool ValidateRegex(const char* regex);
+GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(
+    bool escaped, char ch, char repeat, const char* regex, const char* str);
+GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
+
+#endif  // GTEST_USES_SIMPLE_RE
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv);
+
+#if GTEST_HAS_DEATH_TEST
+
+// Returns the message describing the last system error, regardless of the
+// platform.
+GTEST_API_ std::string GetLastErrnoDescription();
+
+// Attempts to parse a string into a positive integer pointed to by the
+// number parameter. Returns true if that is possible.
+// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use
+// it here.
+template <typename Integer>
+bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
+  // Fail fast if the given string does not begin with a digit;
+  // this bypasses strtoXXX's "optional leading whitespace and plus
+  // or minus sign" semantics, which are undesirable here.
+  if (str.empty() || !IsDigit(str[0])) {
+    return false;
+  }
+  errno = 0;
+
+  char* end;
+  // BiggestConvertible is the largest integer type that system-provided
+  // string-to-number conversion routines can return.
+
+# if GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  // MSVC and C++ Builder define __int64 instead of the standard long long.
+  typedef unsigned __int64 BiggestConvertible;
+  const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10);
+
+# else
+
+  typedef unsigned long long BiggestConvertible;  // NOLINT
+  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);
+
+# endif  // GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  const bool parse_success = *end == '\0' && errno == 0;
+
+  // TODO(vladl@google.com): Convert this to compile time assertion when it is
+  // available.
+  GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed));
+
+  const Integer result = static_cast<Integer>(parsed);
+  if (parse_success && static_cast<BiggestConvertible>(result) == parsed) {
+    *number = result;
+    return true;
+  }
+  return false;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// TestResult contains some private methods that should be hidden from
+// Google Test user but are required for testing. This class allow our tests
+// to access them.
+//
+// This class is supplied only for the purpose of testing Google Test's own
+// constructs. Do not use it in user tests, either directly or indirectly.
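ParseNaturalNumber() above avoids overflow by parsing into the widest unsigned type the platform offers and then checking that the value survives a round trip through the requested integer type; TestResultAccessor, defined next, is unrelated test-only plumbing. A small standalone sketch of that overflow check specialized to int (illustrative only):

#include <cassert>
#include <cctype>
#include <cerrno>
#include <cstdlib>
#include <string>

// Parses a non-negative decimal string into *number, rejecting leading
// signs/whitespace, trailing junk, and values that do not fit in an int.
static bool ParsePositiveInt(const std::string& str, int* number) {
  if (str.empty() || !isdigit(static_cast<unsigned char>(str[0]))) return false;
  errno = 0;
  char* end = NULL;
  const unsigned long long parsed = strtoull(str.c_str(), &end, 10);
  if (*end != '\0' || errno != 0) return false;  // Trailing junk or out of range.
  const int result = static_cast<int>(parsed);
  if (static_cast<unsigned long long>(result) != parsed) return false;  // Too big for int.
  *number = result;
  return true;
}

int main() {
  int n = 0;
  assert(ParsePositiveInt("42", &n) && n == 42);
  assert(!ParsePositiveInt("42abc", &n));                 // Trailing junk.
  assert(!ParsePositiveInt("99999999999999999999", &n));  // Overflows.
  return 0;
}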
+class TestResultAccessor { + public: + static void RecordProperty(TestResult* test_result, + const std::string& xml_element, + const TestProperty& property) { + test_result->RecordProperty(xml_element, property); + } + + static void ClearTestPartResults(TestResult* test_result) { + test_result->ClearTestPartResults(); + } + + static const std::vector& test_part_results( + const TestResult& test_result) { + return test_result.test_part_results(); + } +}; + +#if GTEST_CAN_STREAM_RESULTS_ + +// Streams test results to the given port on the given host machine. +class GTEST_API_ StreamingListener : public EmptyTestEventListener { + public: + // Abstract base class for writing strings to a socket. + class AbstractSocketWriter { + public: + virtual ~AbstractSocketWriter() {} + + // Sends a string to the socket. + virtual void Send(const string& message) = 0; + + // Closes the socket. + virtual void CloseConnection() {} + + // Sends a string and a newline to the socket. + void SendLn(const string& message) { + Send(message + "\n"); + } + }; + + // Concrete class for actually writing strings to a socket. + class SocketWriter : public AbstractSocketWriter { + public: + SocketWriter(const string& host, const string& port) + : sockfd_(-1), host_name_(host), port_num_(port) { + MakeConnection(); + } + + virtual ~SocketWriter() { + if (sockfd_ != -1) + CloseConnection(); + } + + // Sends a string to the socket. + virtual void Send(const string& message) { + GTEST_CHECK_(sockfd_ != -1) + << "Send() can be called only when there is a connection."; + + const int len = static_cast(message.length()); + if (write(sockfd_, message.c_str(), len) != len) { + GTEST_LOG_(WARNING) + << "stream_result_to: failed to stream to " + << host_name_ << ":" << port_num_; + } + } + + private: + // Creates a client socket and connects to the server. + void MakeConnection(); + + // Closes the socket. + void CloseConnection() { + GTEST_CHECK_(sockfd_ != -1) + << "CloseConnection() can be called only when there is a connection."; + + close(sockfd_); + sockfd_ = -1; + } + + int sockfd_; // socket file descriptor + const string host_name_; + const string port_num_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter); + }; // class SocketWriter + + // Escapes '=', '&', '%', and '\n' characters in str as "%xx". + static string UrlEncode(const char* str); + + StreamingListener(const string& host, const string& port) + : socket_writer_(new SocketWriter(host, port)) { Start(); } + + explicit StreamingListener(AbstractSocketWriter* socket_writer) + : socket_writer_(socket_writer) { Start(); } + + void OnTestProgramStart(const UnitTest& /* unit_test */) { + SendLn("event=TestProgramStart"); + } + + void OnTestProgramEnd(const UnitTest& unit_test) { + // Note that Google Test current only report elapsed time for each + // test iteration, not for the entire test program. + SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed())); + + // Notify the streaming server to stop. 
+ socket_writer_->CloseConnection(); + } + + void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) { + SendLn("event=TestIterationStart&iteration=" + + StreamableToString(iteration)); + } + + void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) { + SendLn("event=TestIterationEnd&passed=" + + FormatBool(unit_test.Passed()) + "&elapsed_time=" + + StreamableToString(unit_test.elapsed_time()) + "ms"); + } + + void OnTestCaseStart(const TestCase& test_case) { + SendLn(std::string("event=TestCaseStart&name=") + test_case.name()); + } + + void OnTestCaseEnd(const TestCase& test_case) { + SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) + + "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) + + "ms"); + } + + void OnTestStart(const TestInfo& test_info) { + SendLn(std::string("event=TestStart&name=") + test_info.name()); + } + + void OnTestEnd(const TestInfo& test_info) { + SendLn("event=TestEnd&passed=" + + FormatBool((test_info.result())->Passed()) + + "&elapsed_time=" + + StreamableToString((test_info.result())->elapsed_time()) + "ms"); + } + + void OnTestPartResult(const TestPartResult& test_part_result) { + const char* file_name = test_part_result.file_name(); + if (file_name == NULL) + file_name = ""; + SendLn("event=TestPartResult&file=" + UrlEncode(file_name) + + "&line=" + StreamableToString(test_part_result.line_number()) + + "&message=" + UrlEncode(test_part_result.message())); + } + + private: + // Sends the given message and a newline to the socket. + void SendLn(const string& message) { socket_writer_->SendLn(message); } + + // Called at the start of streaming to notify the receiver what + // protocol we are using. + void Start() { SendLn("gtest_streaming_protocol_version=1.0"); } + + string FormatBool(bool value) { return value ? "1" : "0"; } + + const scoped_ptr socket_writer_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener); +}; // class StreamingListener + +#endif // GTEST_CAN_STREAM_RESULTS_ + +} // namespace internal +} // namespace testing + +#endif // GTEST_SRC_GTEST_INTERNAL_INL_H_ diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-port.cc b/libs/libvpx/third_party/googletest/src/src/gtest-port.cc new file mode 100644 index 0000000000..e5bf3dd2be --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/src/gtest-port.cc @@ -0,0 +1,1259 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+#include "gtest/internal/gtest-port.h"
+
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <fstream>
+
+#if GTEST_OS_WINDOWS
+# include <windows.h>
+# include <io.h>
+# include <sys/stat.h>
+# include <map>  // Used in ThreadLocal.
+#else
+# include <unistd.h>
+#endif  // GTEST_OS_WINDOWS
+
+#if GTEST_OS_MAC
+# include <mach/mach_init.h>
+# include <mach/task.h>
+# include <mach/vm_map.h>
+#endif  // GTEST_OS_MAC
+
+#if GTEST_OS_QNX
+# include <devctl.h>
+# include <fcntl.h>
+# include <sys/procfs.h>
+#endif  // GTEST_OS_QNX
+
+#if GTEST_OS_AIX
+# include <procinfo.h>
+# include <sys/types.h>
+#endif  // GTEST_OS_AIX
+
+#include "gtest/gtest-spi.h"
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick exists to
+// prevent the accidental inclusion of gtest-internal-inl.h in the
+// user's code.
+#define GTEST_IMPLEMENTATION_ 1
+#include "src/gtest-internal-inl.h"
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+namespace internal {
+
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
+const int kStdOutFileno = 1;
+const int kStdErrFileno = 2;
+#else
+const int kStdOutFileno = STDOUT_FILENO;
+const int kStdErrFileno = STDERR_FILENO;
+#endif  // _MSC_VER
+
+#if GTEST_OS_LINUX
+
+namespace {
+template <typename T>
+T ReadProcFileField(const string& filename, int field) {
+  std::string dummy;
+  std::ifstream file(filename.c_str());
+  while (field-- > 0) {
+    file >> dummy;
+  }
+  T output = 0;
+  file >> output;
+  return output;
+}
+}  // namespace
+
+// Returns the number of active threads, or 0 when there is an error.
+size_t GetThreadCount() {
+  const string filename =
+      (Message() << "/proc/" << getpid() << "/stat").GetString();
+  return ReadProcFileField<int>(filename, 19);
+}
+
+#elif GTEST_OS_MAC
+
+size_t GetThreadCount() {
+  const task_t task = mach_task_self();
+  mach_msg_type_number_t thread_count;
+  thread_act_array_t thread_list;
+  const kern_return_t status = task_threads(task, &thread_list, &thread_count);
+  if (status == KERN_SUCCESS) {
+    // task_threads allocates resources in thread_list and we need to free them
+    // to avoid leaks.
+    vm_deallocate(task,
+                  reinterpret_cast<vm_address_t>(thread_list),
+                  sizeof(thread_t) * thread_count);
+    return static_cast<size_t>(thread_count);
+  } else {
+    return 0;
+  }
+}
+
+#elif GTEST_OS_QNX
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() { + const int fd = open("/proc/self/as", O_RDONLY); + if (fd < 0) { + return 0; + } + procfs_info process_info; + const int status = + devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), NULL); + close(fd); + if (status == EOK) { + return static_cast(process_info.num_threads); + } else { + return 0; + } +} + +#elif GTEST_OS_AIX + +size_t GetThreadCount() { + struct procentry64 entry; + pid_t pid = getpid(); + int status = getprocs64(&entry, sizeof(entry), NULL, 0, &pid, 1); + if (status == 1) { + return entry.pi_thcount; + } else { + return 0; + } +} + +#else + +size_t GetThreadCount() { + // There's no portable way to detect the number of threads, so we just + // return 0 to indicate that we cannot detect it. + return 0; +} + +#endif // GTEST_OS_LINUX + +#if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS + +void SleepMilliseconds(int n) { + ::Sleep(n); +} + +AutoHandle::AutoHandle() + : handle_(INVALID_HANDLE_VALUE) {} + +AutoHandle::AutoHandle(Handle handle) + : handle_(handle) {} + +AutoHandle::~AutoHandle() { + Reset(); +} + +AutoHandle::Handle AutoHandle::Get() const { + return handle_; +} + +void AutoHandle::Reset() { + Reset(INVALID_HANDLE_VALUE); +} + +void AutoHandle::Reset(HANDLE handle) { + // Resetting with the same handle we already own is invalid. + if (handle_ != handle) { + if (IsCloseable()) { + ::CloseHandle(handle_); + } + handle_ = handle; + } else { + GTEST_CHECK_(!IsCloseable()) + << "Resetting a valid handle to itself is likely a programmer error " + "and thus not allowed."; + } +} + +bool AutoHandle::IsCloseable() const { + // Different Windows APIs may use either of these values to represent an + // invalid handle. + return handle_ != NULL && handle_ != INVALID_HANDLE_VALUE; +} + +Notification::Notification() + : event_(::CreateEvent(NULL, // Default security attributes. + TRUE, // Do not reset automatically. + FALSE, // Initially unset. + NULL)) { // Anonymous event. + GTEST_CHECK_(event_.Get() != NULL); +} + +void Notification::Notify() { + GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE); +} + +void Notification::WaitForNotification() { + GTEST_CHECK_( + ::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0); +} + +Mutex::Mutex() + : owner_thread_id_(0), + type_(kDynamic), + critical_section_init_phase_(0), + critical_section_(new CRITICAL_SECTION) { + ::InitializeCriticalSection(critical_section_); +} + +Mutex::~Mutex() { + // Static mutexes are leaked intentionally. It is not thread-safe to try + // to clean them up. + // TODO(yukawa): Switch to Slim Reader/Writer (SRW) Locks, which requires + // nothing to clean it up but is available only on Vista and later. + // http://msdn.microsoft.com/en-us/library/windows/desktop/aa904937.aspx + if (type_ == kDynamic) { + ::DeleteCriticalSection(critical_section_); + delete critical_section_; + critical_section_ = NULL; + } +} + +void Mutex::Lock() { + ThreadSafeLazyInit(); + ::EnterCriticalSection(critical_section_); + owner_thread_id_ = ::GetCurrentThreadId(); +} + +void Mutex::Unlock() { + ThreadSafeLazyInit(); + // We don't protect writing to owner_thread_id_ here, as it's the + // caller's responsibility to ensure that the current thread holds the + // mutex when this is called. + owner_thread_id_ = 0; + ::LeaveCriticalSection(critical_section_); +} + +// Does nothing if the current thread holds the mutex. Otherwise, crashes +// with high probability. 
+void Mutex::AssertHeld() { + ThreadSafeLazyInit(); + GTEST_CHECK_(owner_thread_id_ == ::GetCurrentThreadId()) + << "The current thread is not holding the mutex @" << this; +} + +// Initializes owner_thread_id_ and critical_section_ in static mutexes. +void Mutex::ThreadSafeLazyInit() { + // Dynamic mutexes are initialized in the constructor. + if (type_ == kStatic) { + switch ( + ::InterlockedCompareExchange(&critical_section_init_phase_, 1L, 0L)) { + case 0: + // If critical_section_init_phase_ was 0 before the exchange, we + // are the first to test it and need to perform the initialization. + owner_thread_id_ = 0; + critical_section_ = new CRITICAL_SECTION; + ::InitializeCriticalSection(critical_section_); + // Updates the critical_section_init_phase_ to 2 to signal + // initialization complete. + GTEST_CHECK_(::InterlockedCompareExchange( + &critical_section_init_phase_, 2L, 1L) == + 1L); + break; + case 1: + // Somebody else is already initializing the mutex; spin until they + // are done. + while (::InterlockedCompareExchange(&critical_section_init_phase_, + 2L, + 2L) != 2L) { + // Possibly yields the rest of the thread's time slice to other + // threads. + ::Sleep(0); + } + break; + + case 2: + break; // The mutex is already initialized and ready for use. + + default: + GTEST_CHECK_(false) + << "Unexpected value of critical_section_init_phase_ " + << "while initializing a static mutex."; + } + } +} + +namespace { + +class ThreadWithParamSupport : public ThreadWithParamBase { + public: + static HANDLE CreateThread(Runnable* runnable, + Notification* thread_can_start) { + ThreadMainParam* param = new ThreadMainParam(runnable, thread_can_start); + DWORD thread_id; + // TODO(yukawa): Consider to use _beginthreadex instead. + HANDLE thread_handle = ::CreateThread( + NULL, // Default security. + 0, // Default stack size. + &ThreadWithParamSupport::ThreadMain, + param, // Parameter to ThreadMainStatic + 0x0, // Default creation flags. + &thread_id); // Need a valid pointer for the call to work under Win98. + GTEST_CHECK_(thread_handle != NULL) << "CreateThread failed with error " + << ::GetLastError() << "."; + if (thread_handle == NULL) { + delete param; + } + return thread_handle; + } + + private: + struct ThreadMainParam { + ThreadMainParam(Runnable* runnable, Notification* thread_can_start) + : runnable_(runnable), + thread_can_start_(thread_can_start) { + } + scoped_ptr runnable_; + // Does not own. + Notification* thread_can_start_; + }; + + static DWORD WINAPI ThreadMain(void* ptr) { + // Transfers ownership. + scoped_ptr param(static_cast(ptr)); + if (param->thread_can_start_ != NULL) + param->thread_can_start_->WaitForNotification(); + param->runnable_->Run(); + return 0; + } + + // Prohibit instantiation. + ThreadWithParamSupport(); + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParamSupport); +}; + +} // namespace + +ThreadWithParamBase::ThreadWithParamBase(Runnable *runnable, + Notification* thread_can_start) + : thread_(ThreadWithParamSupport::CreateThread(runnable, + thread_can_start)) { +} + +ThreadWithParamBase::~ThreadWithParamBase() { + Join(); +} + +void ThreadWithParamBase::Join() { + GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0) + << "Failed to join the thread with error " << ::GetLastError() << "."; +} + +// Maps a thread to a set of ThreadIdToThreadLocals that have values +// instantiated on that thread and notifies them when the thread exits. 
A +// ThreadLocal instance is expected to persist until all threads it has +// values on have terminated. +class ThreadLocalRegistryImpl { + public: + // Registers thread_local_instance as having value on the current thread. + // Returns a value that can be used to identify the thread from other threads. + static ThreadLocalValueHolderBase* GetValueOnCurrentThread( + const ThreadLocalBase* thread_local_instance) { + DWORD current_thread = ::GetCurrentThreadId(); + MutexLock lock(&mutex_); + ThreadIdToThreadLocals* const thread_to_thread_locals = + GetThreadLocalsMapLocked(); + ThreadIdToThreadLocals::iterator thread_local_pos = + thread_to_thread_locals->find(current_thread); + if (thread_local_pos == thread_to_thread_locals->end()) { + thread_local_pos = thread_to_thread_locals->insert( + std::make_pair(current_thread, ThreadLocalValues())).first; + StartWatcherThreadFor(current_thread); + } + ThreadLocalValues& thread_local_values = thread_local_pos->second; + ThreadLocalValues::iterator value_pos = + thread_local_values.find(thread_local_instance); + if (value_pos == thread_local_values.end()) { + value_pos = + thread_local_values + .insert(std::make_pair( + thread_local_instance, + linked_ptr( + thread_local_instance->NewValueForCurrentThread()))) + .first; + } + return value_pos->second.get(); + } + + static void OnThreadLocalDestroyed( + const ThreadLocalBase* thread_local_instance) { + std::vector > value_holders; + // Clean up the ThreadLocalValues data structure while holding the lock, but + // defer the destruction of the ThreadLocalValueHolderBases. + { + MutexLock lock(&mutex_); + ThreadIdToThreadLocals* const thread_to_thread_locals = + GetThreadLocalsMapLocked(); + for (ThreadIdToThreadLocals::iterator it = + thread_to_thread_locals->begin(); + it != thread_to_thread_locals->end(); + ++it) { + ThreadLocalValues& thread_local_values = it->second; + ThreadLocalValues::iterator value_pos = + thread_local_values.find(thread_local_instance); + if (value_pos != thread_local_values.end()) { + value_holders.push_back(value_pos->second); + thread_local_values.erase(value_pos); + // This 'if' can only be successful at most once, so theoretically we + // could break out of the loop here, but we don't bother doing so. + } + } + } + // Outside the lock, let the destructor for 'value_holders' deallocate the + // ThreadLocalValueHolderBases. + } + + static void OnThreadExit(DWORD thread_id) { + GTEST_CHECK_(thread_id != 0) << ::GetLastError(); + std::vector > value_holders; + // Clean up the ThreadIdToThreadLocals data structure while holding the + // lock, but defer the destruction of the ThreadLocalValueHolderBases. + { + MutexLock lock(&mutex_); + ThreadIdToThreadLocals* const thread_to_thread_locals = + GetThreadLocalsMapLocked(); + ThreadIdToThreadLocals::iterator thread_local_pos = + thread_to_thread_locals->find(thread_id); + if (thread_local_pos != thread_to_thread_locals->end()) { + ThreadLocalValues& thread_local_values = thread_local_pos->second; + for (ThreadLocalValues::iterator value_pos = + thread_local_values.begin(); + value_pos != thread_local_values.end(); + ++value_pos) { + value_holders.push_back(value_pos->second); + } + thread_to_thread_locals->erase(thread_local_pos); + } + } + // Outside the lock, let the destructor for 'value_holders' deallocate the + // ThreadLocalValueHolderBases. + } + + private: + // In a particular thread, maps a ThreadLocal object to its value. 
+ typedef std::map > ThreadLocalValues; + // Stores all ThreadIdToThreadLocals having values in a thread, indexed by + // thread's ID. + typedef std::map ThreadIdToThreadLocals; + + // Holds the thread id and thread handle that we pass from + // StartWatcherThreadFor to WatcherThreadFunc. + typedef std::pair ThreadIdAndHandle; + + static void StartWatcherThreadFor(DWORD thread_id) { + // The returned handle will be kept in thread_map and closed by + // watcher_thread in WatcherThreadFunc. + HANDLE thread = ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, + FALSE, + thread_id); + GTEST_CHECK_(thread != NULL); + // We need to to pass a valid thread ID pointer into CreateThread for it + // to work correctly under Win98. + DWORD watcher_thread_id; + HANDLE watcher_thread = ::CreateThread( + NULL, // Default security. + 0, // Default stack size + &ThreadLocalRegistryImpl::WatcherThreadFunc, + reinterpret_cast(new ThreadIdAndHandle(thread_id, thread)), + CREATE_SUSPENDED, + &watcher_thread_id); + GTEST_CHECK_(watcher_thread != NULL); + // Give the watcher thread the same priority as ours to avoid being + // blocked by it. + ::SetThreadPriority(watcher_thread, + ::GetThreadPriority(::GetCurrentThread())); + ::ResumeThread(watcher_thread); + ::CloseHandle(watcher_thread); + } + + // Monitors exit from a given thread and notifies those + // ThreadIdToThreadLocals about thread termination. + static DWORD WINAPI WatcherThreadFunc(LPVOID param) { + const ThreadIdAndHandle* tah = + reinterpret_cast(param); + GTEST_CHECK_( + ::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0); + OnThreadExit(tah->first); + ::CloseHandle(tah->second); + delete tah; + return 0; + } + + // Returns map of thread local instances. + static ThreadIdToThreadLocals* GetThreadLocalsMapLocked() { + mutex_.AssertHeld(); + static ThreadIdToThreadLocals* map = new ThreadIdToThreadLocals; + return map; + } + + // Protects access to GetThreadLocalsMapLocked() and its return value. + static Mutex mutex_; + // Protects access to GetThreadMapLocked() and its return value. + static Mutex thread_map_mutex_; +}; + +Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex); +Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex); + +ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread( + const ThreadLocalBase* thread_local_instance) { + return ThreadLocalRegistryImpl::GetValueOnCurrentThread( + thread_local_instance); +} + +void ThreadLocalRegistry::OnThreadLocalDestroyed( + const ThreadLocalBase* thread_local_instance) { + ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance); +} + +#endif // GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS + +#if GTEST_USES_POSIX_RE + +// Implements RE. Currently only needed for death tests. + +RE::~RE() { + if (is_valid_) { + // regfree'ing an invalid regex might crash because the content + // of the regex is undefined. Since the regex's are essentially + // the same, one cannot be valid (or invalid) without the other + // being so too. + regfree(&partial_regex_); + regfree(&full_regex_); + } + free(const_cast(pattern_)); +} + +// Returns true iff regular expression re matches the entire str. +bool RE::FullMatch(const char* str, const RE& re) { + if (!re.is_valid_) return false; + + regmatch_t match; + return regexec(&re.full_regex_, str, 1, &match, 0) == 0; +} + +// Returns true iff regular expression re matches a substring of str +// (including str itself). 
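For illustration, the two entry points differ only in anchoring; a minimal sketch, assuming an RE built from a POSIX Extended pattern:

    const RE re("[0-9]+");
    bool partial = RE::PartialMatch("abc123", re);  // true: a substring matches
    bool full = RE::FullMatch("abc123", re);        // false: the entire string must match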
+bool RE::PartialMatch(const char* str, const RE& re) { + if (!re.is_valid_) return false; + + regmatch_t match; + return regexec(&re.partial_regex_, str, 1, &match, 0) == 0; +} + +// Initializes an RE from its string representation. +void RE::Init(const char* regex) { + pattern_ = posix::StrDup(regex); + + // Reserves enough bytes to hold the regular expression used for a + // full match. + const size_t full_regex_len = strlen(regex) + 10; + char* const full_pattern = new char[full_regex_len]; + + snprintf(full_pattern, full_regex_len, "^(%s)$", regex); + is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0; + // We want to call regcomp(&partial_regex_, ...) even if the + // previous expression returns false. Otherwise partial_regex_ may + // not be properly initialized can may cause trouble when it's + // freed. + // + // Some implementation of POSIX regex (e.g. on at least some + // versions of Cygwin) doesn't accept the empty string as a valid + // regex. We change it to an equivalent form "()" to be safe. + if (is_valid_) { + const char* const partial_regex = (*regex == '\0') ? "()" : regex; + is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0; + } + EXPECT_TRUE(is_valid_) + << "Regular expression \"" << regex + << "\" is not a valid POSIX Extended regular expression."; + + delete[] full_pattern; +} + +#elif GTEST_USES_SIMPLE_RE + +// Returns true iff ch appears anywhere in str (excluding the +// terminating '\0' character). +bool IsInSet(char ch, const char* str) { + return ch != '\0' && strchr(str, ch) != NULL; +} + +// Returns true iff ch belongs to the given classification. Unlike +// similar functions in , these aren't affected by the +// current locale. +bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; } +bool IsAsciiPunct(char ch) { + return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~"); +} +bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); } +bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); } +bool IsAsciiWordChar(char ch) { + return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || + ('0' <= ch && ch <= '9') || ch == '_'; +} + +// Returns true iff "\\c" is a supported escape sequence. +bool IsValidEscape(char c) { + return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW")); +} + +// Returns true iff the given atom (specified by escaped and pattern) +// matches ch. The result is undefined if the atom is invalid. +bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { + if (escaped) { // "\\p" where p is pattern_char. + switch (pattern_char) { + case 'd': return IsAsciiDigit(ch); + case 'D': return !IsAsciiDigit(ch); + case 'f': return ch == '\f'; + case 'n': return ch == '\n'; + case 'r': return ch == '\r'; + case 's': return IsAsciiWhiteSpace(ch); + case 'S': return !IsAsciiWhiteSpace(ch); + case 't': return ch == '\t'; + case 'v': return ch == '\v'; + case 'w': return IsAsciiWordChar(ch); + case 'W': return !IsAsciiWordChar(ch); + } + return IsAsciiPunct(pattern_char) && pattern_char == ch; + } + + return (pattern_char == '.' && ch != '\n') || pattern_char == ch; +} + +// Helper function used by ValidateRegex() to format error messages. +std::string FormatRegexSyntaxError(const char* regex, int index) { + return (Message() << "Syntax error at index " << index + << " in simple regular expression \"" << regex << "\": ").GetString(); +} + +// Generates non-fatal failures and returns false if regex is invalid; +// otherwise returns true. 
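Two hypothetical patterns illustrate the checks performed below:

    ValidateRegex("\\d+x?");  // accepted: escapes and repetitions are well-formed
    ValidateRegex("ab^c");    // rejected: '^' may only appear at the beginning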
+bool ValidateRegex(const char* regex) { + if (regex == NULL) { + // TODO(wan@google.com): fix the source file location in the + // assertion failures to match where the regex is used in user + // code. + ADD_FAILURE() << "NULL is not a valid simple regular expression."; + return false; + } + + bool is_valid = true; + + // True iff ?, *, or + can follow the previous atom. + bool prev_repeatable = false; + for (int i = 0; regex[i]; i++) { + if (regex[i] == '\\') { // An escape sequence + i++; + if (regex[i] == '\0') { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) + << "'\\' cannot appear at the end."; + return false; + } + + if (!IsValidEscape(regex[i])) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) + << "invalid escape sequence \"\\" << regex[i] << "\"."; + is_valid = false; + } + prev_repeatable = true; + } else { // Not an escape sequence. + const char ch = regex[i]; + + if (ch == '^' && i > 0) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'^' can only appear at the beginning."; + is_valid = false; + } else if (ch == '$' && regex[i + 1] != '\0') { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'$' can only appear at the end."; + is_valid = false; + } else if (IsInSet(ch, "()[]{}|")) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'" << ch << "' is unsupported."; + is_valid = false; + } else if (IsRepeat(ch) && !prev_repeatable) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'" << ch << "' can only follow a repeatable token."; + is_valid = false; + } + + prev_repeatable = !IsInSet(ch, "^$?*+"); + } + } + + return is_valid; +} + +// Matches a repeated regex atom followed by a valid simple regular +// expression. The regex atom is defined as c if escaped is false, +// or \c otherwise. repeat is the repetition meta character (?, *, +// or +). The behavior is undefined if str contains too many +// characters to be indexable by size_t, in which case the test will +// probably time out anyway. We are fine with this limitation as +// std::string has it too. +bool MatchRepetitionAndRegexAtHead( + bool escaped, char c, char repeat, const char* regex, + const char* str) { + const size_t min_count = (repeat == '+') ? 1 : 0; + const size_t max_count = (repeat == '?') ? 1 : + static_cast(-1) - 1; + // We cannot call numeric_limits::max() as it conflicts with the + // max() macro on Windows. + + for (size_t i = 0; i <= max_count; ++i) { + // We know that the atom matches each of the first i characters in str. + if (i >= min_count && MatchRegexAtHead(regex, str + i)) { + // We have enough matches at the head, and the tail matches too. + // Since we only care about *whether* the pattern matches str + // (as opposed to *how* it matches), there is no need to find a + // greedy match. + return true; + } + if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) + return false; + } + return false; +} + +// Returns true iff regex matches a prefix of str. regex must be a +// valid simple regular expression and not start with "^", or the +// result is undefined. +bool MatchRegexAtHead(const char* regex, const char* str) { + if (*regex == '\0') // An empty regex matches a prefix of anything. + return true; + + // "$" only matches the end of a string. Note that regex being + // valid guarantees that there's nothing after "$" in it. + if (*regex == '$') + return *str == '\0'; + + // Is the first thing in regex an escape sequence? 
+ const bool escaped = *regex == '\\'; + if (escaped) + ++regex; + if (IsRepeat(regex[1])) { + // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so + // here's an indirect recursion. It terminates as the regex gets + // shorter in each recursion. + return MatchRepetitionAndRegexAtHead( + escaped, regex[0], regex[1], regex + 2, str); + } else { + // regex isn't empty, isn't "$", and doesn't start with a + // repetition. We match the first atom of regex with the first + // character of str and recurse. + return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) && + MatchRegexAtHead(regex + 1, str + 1); + } +} + +// Returns true iff regex matches any substring of str. regex must be +// a valid simple regular expression, or the result is undefined. +// +// The algorithm is recursive, but the recursion depth doesn't exceed +// the regex length, so we won't need to worry about running out of +// stack space normally. In rare cases the time complexity can be +// exponential with respect to the regex length + the string length, +// but usually it's must faster (often close to linear). +bool MatchRegexAnywhere(const char* regex, const char* str) { + if (regex == NULL || str == NULL) + return false; + + if (*regex == '^') + return MatchRegexAtHead(regex + 1, str); + + // A successful match can be anywhere in str. + do { + if (MatchRegexAtHead(regex, str)) + return true; + } while (*str++ != '\0'); + return false; +} + +// Implements the RE class. + +RE::~RE() { + free(const_cast(pattern_)); + free(const_cast(full_pattern_)); +} + +// Returns true iff regular expression re matches the entire str. +bool RE::FullMatch(const char* str, const RE& re) { + return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str); +} + +// Returns true iff regular expression re matches a substring of str +// (including str itself). +bool RE::PartialMatch(const char* str, const RE& re) { + return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str); +} + +// Initializes an RE from its string representation. +void RE::Init(const char* regex) { + pattern_ = full_pattern_ = NULL; + if (regex != NULL) { + pattern_ = posix::StrDup(regex); + } + + is_valid_ = ValidateRegex(regex); + if (!is_valid_) { + // No need to calculate the full pattern when the regex is invalid. + return; + } + + const size_t len = strlen(regex); + // Reserves enough bytes to hold the regular expression used for a + // full match: we need space to prepend a '^', append a '$', and + // terminate the string with '\0'. + char* buffer = static_cast(malloc(len + 3)); + full_pattern_ = buffer; + + if (*regex != '^') + *buffer++ = '^'; // Makes sure full_pattern_ starts with '^'. + + // We don't use snprintf or strncpy, as they trigger a warning when + // compiled with VC++ 8.0. + memcpy(buffer, regex, len); + buffer += len; + + if (len == 0 || regex[len - 1] != '$') + *buffer++ = '$'; // Makes sure full_pattern_ ends with '$'. + + *buffer = '\0'; +} + +#endif // GTEST_USES_POSIX_RE + +const char kUnknownFile[] = "unknown file"; + +// Formats a source file path and a line number as they would appear +// in an error message from the compiler used to compile this code. +GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) { + const std::string file_name(file == NULL ? 
kUnknownFile : file); + + if (line < 0) { + return file_name + ":"; + } +#ifdef _MSC_VER + return file_name + "(" + StreamableToString(line) + "):"; +#else + return file_name + ":" + StreamableToString(line) + ":"; +#endif // _MSC_VER +} + +// Formats a file location for compiler-independent XML output. +// Although this function is not platform dependent, we put it next to +// FormatFileLocation in order to contrast the two functions. +// Note that FormatCompilerIndependentFileLocation() does NOT append colon +// to the file location it produces, unlike FormatFileLocation(). +GTEST_API_ ::std::string FormatCompilerIndependentFileLocation( + const char* file, int line) { + const std::string file_name(file == NULL ? kUnknownFile : file); + + if (line < 0) + return file_name; + else + return file_name + ":" + StreamableToString(line); +} + +GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line) + : severity_(severity) { + const char* const marker = + severity == GTEST_INFO ? "[ INFO ]" : + severity == GTEST_WARNING ? "[WARNING]" : + severity == GTEST_ERROR ? "[ ERROR ]" : "[ FATAL ]"; + GetStream() << ::std::endl << marker << " " + << FormatFileLocation(file, line).c_str() << ": "; +} + +// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. +GTestLog::~GTestLog() { + GetStream() << ::std::endl; + if (severity_ == GTEST_FATAL) { + fflush(stderr); + posix::Abort(); + } +} +// Disable Microsoft deprecation warnings for POSIX functions called from +// this class (creat, dup, dup2, and close) +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) + +#if GTEST_HAS_STREAM_REDIRECTION + +// Object that captures an output stream (stdout/stderr). +class CapturedStream { + public: + // The ctor redirects the stream to a temporary file. + explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) { +# if GTEST_OS_WINDOWS + char temp_dir_path[MAX_PATH + 1] = { '\0' }; // NOLINT + char temp_file_path[MAX_PATH + 1] = { '\0' }; // NOLINT + + ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path); + const UINT success = ::GetTempFileNameA(temp_dir_path, + "gtest_redir", + 0, // Generate unique file name. + temp_file_path); + GTEST_CHECK_(success != 0) + << "Unable to create a temporary file in " << temp_dir_path; + const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE); + GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file " + << temp_file_path; + filename_ = temp_file_path; +# else + // There's no guarantee that a test has write access to the current + // directory, so we create the temporary file in the /tmp directory + // instead. We use /tmp on most systems, and /sdcard on Android. + // That's because Android doesn't have /tmp. +# if GTEST_OS_LINUX_ANDROID + // Note: Android applications are expected to call the framework's + // Context.getExternalStorageDirectory() method through JNI to get + // the location of the world-writable SD Card directory. However, + // this requires a Context handle, which cannot be retrieved + // globally from native code. Doing so also precludes running the + // code as part of a regular standalone executable, which doesn't + // run in a Dalvik process (e.g. when running it through 'adb shell'). + // + // The location /sdcard is directly accessible from native code + // and is the only location (unofficially) supported by the Android + // team. It's generally a symlink to the real SD Card mount point + // which can be /mnt/sdcard, /mnt/sdcard0, /system/media/sdcard, or + // other OEM-customized locations. 
Never rely on these, and always + // use /sdcard. + char name_template[] = "/sdcard/gtest_captured_stream.XXXXXX"; +# else + char name_template[] = "/tmp/captured_stream.XXXXXX"; +# endif // GTEST_OS_LINUX_ANDROID + const int captured_fd = mkstemp(name_template); + filename_ = name_template; +# endif // GTEST_OS_WINDOWS + fflush(NULL); + dup2(captured_fd, fd_); + close(captured_fd); + } + + ~CapturedStream() { + remove(filename_.c_str()); + } + + std::string GetCapturedString() { + if (uncaptured_fd_ != -1) { + // Restores the original stream. + fflush(NULL); + dup2(uncaptured_fd_, fd_); + close(uncaptured_fd_); + uncaptured_fd_ = -1; + } + + FILE* const file = posix::FOpen(filename_.c_str(), "r"); + const std::string content = ReadEntireFile(file); + posix::FClose(file); + return content; + } + + private: + const int fd_; // A stream to capture. + int uncaptured_fd_; + // Name of the temporary file holding the stderr output. + ::std::string filename_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream); +}; + +GTEST_DISABLE_MSC_WARNINGS_POP_() + +static CapturedStream* g_captured_stderr = NULL; +static CapturedStream* g_captured_stdout = NULL; + +// Starts capturing an output stream (stdout/stderr). +void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) { + if (*stream != NULL) { + GTEST_LOG_(FATAL) << "Only one " << stream_name + << " capturer can exist at a time."; + } + *stream = new CapturedStream(fd); +} + +// Stops capturing the output stream and returns the captured string. +std::string GetCapturedStream(CapturedStream** captured_stream) { + const std::string content = (*captured_stream)->GetCapturedString(); + + delete *captured_stream; + *captured_stream = NULL; + + return content; +} + +// Starts capturing stdout. +void CaptureStdout() { + CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout); +} + +// Starts capturing stderr. +void CaptureStderr() { + CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr); +} + +// Stops capturing stdout and returns the captured string. +std::string GetCapturedStdout() { + return GetCapturedStream(&g_captured_stdout); +} + +// Stops capturing stderr and returns the captured string. +std::string GetCapturedStderr() { + return GetCapturedStream(&g_captured_stderr); +} + +#endif // GTEST_HAS_STREAM_REDIRECTION + +std::string TempDir() { +#if GTEST_OS_WINDOWS_MOBILE + return "\\temp\\"; +#elif GTEST_OS_WINDOWS + const char* temp_dir = posix::GetEnv("TEMP"); + if (temp_dir == NULL || temp_dir[0] == '\0') + return "\\temp\\"; + else if (temp_dir[strlen(temp_dir) - 1] == '\\') + return temp_dir; + else + return std::string(temp_dir) + "\\"; +#elif GTEST_OS_LINUX_ANDROID + return "/sdcard/"; +#else + return "/tmp/"; +#endif // GTEST_OS_WINDOWS_MOBILE +} + +size_t GetFileSize(FILE* file) { + fseek(file, 0, SEEK_END); + return static_cast(ftell(file)); +} + +std::string ReadEntireFile(FILE* file) { + const size_t file_size = GetFileSize(file); + char* const buffer = new char[file_size]; + + size_t bytes_last_read = 0; // # of bytes read in the last fread() + size_t bytes_read = 0; // # of bytes read so far + + fseek(file, 0, SEEK_SET); + + // Keeps reading the file until we cannot read further or the + // pre-determined file size is reached. 
+ do { + bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file); + bytes_read += bytes_last_read; + } while (bytes_last_read > 0 && bytes_read < file_size); + + const std::string content(buffer, bytes_read); + delete[] buffer; + + return content; +} + +#if GTEST_HAS_DEATH_TEST + +static const ::std::vector* g_injected_test_argvs = + NULL; // Owned. + +void SetInjectableArgvs(const ::std::vector* argvs) { + if (g_injected_test_argvs != argvs) + delete g_injected_test_argvs; + g_injected_test_argvs = argvs; +} + +const ::std::vector& GetInjectableArgvs() { + if (g_injected_test_argvs != NULL) { + return *g_injected_test_argvs; + } + return GetArgvs(); +} +#endif // GTEST_HAS_DEATH_TEST + +#if GTEST_OS_WINDOWS_MOBILE +namespace posix { +void Abort() { + DebugBreak(); + TerminateProcess(GetCurrentProcess(), 1); +} +} // namespace posix +#endif // GTEST_OS_WINDOWS_MOBILE + +// Returns the name of the environment variable corresponding to the +// given flag. For example, FlagToEnvVar("foo") will return +// "GTEST_FOO" in the open-source version. +static std::string FlagToEnvVar(const char* flag) { + const std::string full_flag = + (Message() << GTEST_FLAG_PREFIX_ << flag).GetString(); + + Message env_var; + for (size_t i = 0; i != full_flag.length(); i++) { + env_var << ToUpper(full_flag.c_str()[i]); + } + + return env_var.GetString(); +} + +// Parses 'str' for a 32-bit signed integer. If successful, writes +// the result to *value and returns true; otherwise leaves *value +// unchanged and returns false. +bool ParseInt32(const Message& src_text, const char* str, Int32* value) { + // Parses the environment variable as a decimal integer. + char* end = NULL; + const long long_value = strtol(str, &end, 10); // NOLINT + + // Has strtol() consumed all characters in the string? + if (*end != '\0') { + // No - an invalid character was encountered. + Message msg; + msg << "WARNING: " << src_text + << " is expected to be a 32-bit integer, but actually" + << " has value \"" << str << "\".\n"; + printf("%s", msg.GetString().c_str()); + fflush(stdout); + return false; + } + + // Is the parsed value in the range of an Int32? + const Int32 result = static_cast(long_value); + if (long_value == LONG_MAX || long_value == LONG_MIN || + // The parsed value overflows as a long. (strtol() returns + // LONG_MAX or LONG_MIN when the input overflows.) + result != long_value + // The parsed value overflows as an Int32. + ) { + Message msg; + msg << "WARNING: " << src_text + << " is expected to be a 32-bit integer, but actually" + << " has value " << str << ", which overflows.\n"; + printf("%s", msg.GetString().c_str()); + fflush(stdout); + return false; + } + + *value = result; + return true; +} + +// Reads and returns the Boolean environment variable corresponding to +// the given flag; if it's not set, returns default_value. +// +// The value is considered true iff it's not "0". +bool BoolFromGTestEnv(const char* flag, bool default_value) { +#if defined(GTEST_GET_BOOL_FROM_ENV_) + return GTEST_GET_BOOL_FROM_ENV_(flag, default_value); +#endif // defined(GTEST_GET_BOOL_FROM_ENV_) + const std::string env_var = FlagToEnvVar(flag); + const char* const string_value = posix::GetEnv(env_var.c_str()); + return string_value == NULL ? + default_value : strcmp(string_value, "0") != 0; +} + +// Reads and returns a 32-bit integer stored in the environment +// variable corresponding to the given flag; if it isn't set or +// doesn't represent a valid 32-bit integer, returns default_value. 
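For example, assuming the standard "gtest_" flag prefix, a call like the following consults the GTEST_STACK_TRACE_DEPTH environment variable and falls back to the given default when it is unset or malformed:

    const Int32 depth = Int32FromGTestEnv("stack_trace_depth", 100);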
+Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) { +#if defined(GTEST_GET_INT32_FROM_ENV_) + return GTEST_GET_INT32_FROM_ENV_(flag, default_value); +#endif // defined(GTEST_GET_INT32_FROM_ENV_) + const std::string env_var = FlagToEnvVar(flag); + const char* const string_value = posix::GetEnv(env_var.c_str()); + if (string_value == NULL) { + // The environment variable is not set. + return default_value; + } + + Int32 result = default_value; + if (!ParseInt32(Message() << "Environment variable " << env_var, + string_value, &result)) { + printf("The default value %s is used.\n", + (Message() << default_value).GetString().c_str()); + fflush(stdout); + return default_value; + } + + return result; +} + +// Reads and returns the string environment variable corresponding to +// the given flag; if it's not set, returns default_value. +std::string StringFromGTestEnv(const char* flag, const char* default_value) { +#if defined(GTEST_GET_STRING_FROM_ENV_) + return GTEST_GET_STRING_FROM_ENV_(flag, default_value); +#endif // defined(GTEST_GET_STRING_FROM_ENV_) + const std::string env_var = FlagToEnvVar(flag); + const char* value = posix::GetEnv(env_var.c_str()); + if (value != NULL) { + return value; + } + + // As a special case for the 'output' flag, if GTEST_OUTPUT is not + // set, we look for XML_OUTPUT_FILE, which is set by the Bazel build + // system. The value of XML_OUTPUT_FILE is a filename without the + // "xml:" prefix of GTEST_OUTPUT. + // + // The net priority order after flag processing is thus: + // --gtest_output command line flag + // GTEST_OUTPUT environment variable + // XML_OUTPUT_FILE environment variable + // 'default_value' + if (strcmp(flag, "output") == 0) { + value = posix::GetEnv("XML_OUTPUT_FILE"); + if (value != NULL) { + return std::string("xml:") + value; + } + } + return default_value; +} + +} // namespace internal +} // namespace testing diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-printers.cc b/libs/libvpx/third_party/googletest/src/src/gtest-printers.cc new file mode 100644 index 0000000000..a2df412f8a --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/src/gtest-printers.cc @@ -0,0 +1,373 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Google Test - The Google C++ Testing Framework +// +// This file implements a universal value printer that can print a +// value of any type T: +// +// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); +// +// It uses the << operator when possible, and prints the bytes in the +// object otherwise. A user can override its behavior for a class +// type Foo by defining either operator<<(::std::ostream&, const Foo&) +// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that +// defines Foo. + +#include "gtest/gtest-printers.h" +#include +#include +#include +#include // NOLINT +#include +#include "gtest/internal/gtest-port.h" + +namespace testing { + +namespace { + +using ::std::ostream; + +// Prints a segment of bytes in the given object. +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start, + size_t count, ostream* os) { + char text[5] = ""; + for (size_t i = 0; i != count; i++) { + const size_t j = start + i; + if (i != 0) { + // Organizes the bytes into groups of 2 for easy parsing by + // human. + if ((j % 2) == 0) + *os << ' '; + else + *os << '-'; + } + GTEST_SNPRINTF_(text, sizeof(text), "%02X", obj_bytes[j]); + *os << text; + } +} + +// Prints the bytes in the given value to the given ostream. +void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count, + ostream* os) { + // Tells the user how big the object is. + *os << count << "-byte object <"; + + const size_t kThreshold = 132; + const size_t kChunkSize = 64; + // If the object size is bigger than kThreshold, we'll have to omit + // some details by printing only the first and the last kChunkSize + // bytes. + // TODO(wan): let the user control the threshold using a flag. + if (count < kThreshold) { + PrintByteSegmentInObjectTo(obj_bytes, 0, count, os); + } else { + PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os); + *os << " ... "; + // Rounds up to 2-byte boundary. + const size_t resume_pos = (count - kChunkSize + 1)/2*2; + PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os); + } + *os << ">"; +} + +} // namespace + +namespace internal2 { + +// Delegates to PrintBytesInObjectToImpl() to print the bytes in the +// given object. The delegation simplifies the implementation, which +// uses the << operator and thus is easier done outside of the +// ::testing::internal namespace, which contains a << operator that +// sometimes conflicts with the one in STL. +void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count, + ostream* os) { + PrintBytesInObjectToImpl(obj_bytes, count, os); +} + +} // namespace internal2 + +namespace internal { + +// Depending on the value of a char (or wchar_t), we print it in one +// of three formats: +// - as is if it's a printable ASCII (e.g. 
'a', '2', ' '),
+// - as a hexidecimal escape sequence (e.g. '\x7F'), or
+// - as a special escape sequence (e.g. '\r', '\n').
+enum CharFormat {
+  kAsIs,
+  kHexEscape,
+  kSpecialEscape
+};
+
+// Returns true if c is a printable ASCII character.  We test the
+// value of c directly instead of calling isprint(), which is buggy on
+// Windows Mobile.
+inline bool IsPrintableAscii(wchar_t c) {
+  return 0x20 <= c && c <= 0x7E;
+}
+
+// Prints a wide or narrow char c as a character literal without the
+// quotes, escaping it when necessary; returns how c was formatted.
+// The template argument UnsignedChar is the unsigned version of Char,
+// which is the type of c.
+template <typename UnsignedChar, typename Char>
+static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
+  switch (static_cast<wchar_t>(c)) {
+    case L'\0':
+      *os << "\\0";
+      break;
+    case L'\'':
+      *os << "\\'";
+      break;
+    case L'\\':
+      *os << "\\\\";
+      break;
+    case L'\a':
+      *os << "\\a";
+      break;
+    case L'\b':
+      *os << "\\b";
+      break;
+    case L'\f':
+      *os << "\\f";
+      break;
+    case L'\n':
+      *os << "\\n";
+      break;
+    case L'\r':
+      *os << "\\r";
+      break;
+    case L'\t':
+      *os << "\\t";
+      break;
+    case L'\v':
+      *os << "\\v";
+      break;
+    default:
+      if (IsPrintableAscii(c)) {
+        *os << static_cast<char>(c);
+        return kAsIs;
+      } else {
+        *os << "\\x" + String::FormatHexInt(static_cast<UnsignedChar>(c));
+        return kHexEscape;
+      }
+  }
+  return kSpecialEscape;
+}
+
+// Prints a wchar_t c as if it's part of a string literal, escaping it when
+// necessary; returns how c was formatted.
+static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
+  switch (c) {
+    case L'\'':
+      *os << "'";
+      return kAsIs;
+    case L'"':
+      *os << "\\\"";
+      return kSpecialEscape;
+    default:
+      return PrintAsCharLiteralTo<wchar_t>(c, os);
+  }
+}
+
+// Prints a char c as if it's part of a string literal, escaping it when
+// necessary; returns how c was formatted.
+static CharFormat PrintAsStringLiteralTo(char c, ostream* os) {
+  return PrintAsStringLiteralTo(
+      static_cast<wchar_t>(static_cast<unsigned char>(c)), os);
+}
+
+// Prints a wide or narrow character c and its code.  '\0' is printed
+// as "'\\0'", other unprintable characters are also properly escaped
+// using the standard C++ escape sequence.  The template argument
+// UnsignedChar is the unsigned version of Char, which is the type of c.
+template <typename UnsignedChar, typename Char>
+void PrintCharAndCodeTo(Char c, ostream* os) {
+  // First, print c as a literal in the most readable form we can find.
+  *os << ((sizeof(c) > 1) ? "L'" : "'");
+  const CharFormat format = PrintAsCharLiteralTo<UnsignedChar>(c, os);
+  *os << "'";
+
+  // To aid user debugging, we also print c's code in decimal, unless
+  // it's 0 (in which case c was printed as '\\0', making the code
+  // obvious).
+  if (c == 0)
+    return;
+  *os << " (" << static_cast<int>(c);
+
+  // For more convenience, we print c's code again in hexidecimal,
+  // unless c was already printed in the form '\x##' or the code is in
+  // [1, 9].
+  if (format == kHexEscape || (1 <= c && c <= 9)) {
+    // Do nothing.
+  } else {
+    *os << ", 0x" << String::FormatHexInt(static_cast<UnsignedChar>(c));
+  }
+  *os << ")";
+}
+
+void PrintTo(unsigned char c, ::std::ostream* os) {
+  PrintCharAndCodeTo<unsigned char>(c, os);
+}
+void PrintTo(signed char c, ::std::ostream* os) {
+  PrintCharAndCodeTo<unsigned char>(c, os);
+}
+
+// Prints a wchar_t as a symbol if it is printable or as its internal
+// code otherwise and also as its code.  L'\0' is printed as "L'\\0'".
+void PrintTo(wchar_t wc, ostream* os) {
+  PrintCharAndCodeTo<wchar_t>(wc, os);
+}
+
+// Prints the given array of characters to the ostream. CharType must be either
+// char or wchar_t.
+// The array starts at begin, the length is len, it may include '\0' characters +// and may not be NUL-terminated. +template +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +static void PrintCharsAsStringTo( + const CharType* begin, size_t len, ostream* os) { + const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\""; + *os << kQuoteBegin; + bool is_previous_hex = false; + for (size_t index = 0; index < len; ++index) { + const CharType cur = begin[index]; + if (is_previous_hex && IsXDigit(cur)) { + // Previous character is of '\x..' form and this character can be + // interpreted as another hexadecimal digit in its number. Break string to + // disambiguate. + *os << "\" " << kQuoteBegin; + } + is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape; + } + *os << "\""; +} + +// Prints a (const) char/wchar_t array of 'len' elements, starting at address +// 'begin'. CharType must be either char or wchar_t. +template +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +static void UniversalPrintCharArray( + const CharType* begin, size_t len, ostream* os) { + // The code + // const char kFoo[] = "foo"; + // generates an array of 4, not 3, elements, with the last one being '\0'. + // + // Therefore when printing a char array, we don't print the last element if + // it's '\0', such that the output matches the string literal as it's + // written in the source code. + if (len > 0 && begin[len - 1] == '\0') { + PrintCharsAsStringTo(begin, len - 1, os); + return; + } + + // If, however, the last element in the array is not '\0', e.g. + // const char kFoo[] = { 'f', 'o', 'o' }; + // we must print the entire array. We also print a message to indicate + // that the array is not NUL-terminated. + PrintCharsAsStringTo(begin, len, os); + *os << " (no terminating NUL)"; +} + +// Prints a (const) char array of 'len' elements, starting at address 'begin'. +void UniversalPrintArray(const char* begin, size_t len, ostream* os) { + UniversalPrintCharArray(begin, len, os); +} + +// Prints a (const) wchar_t array of 'len' elements, starting at address +// 'begin'. +void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) { + UniversalPrintCharArray(begin, len, os); +} + +// Prints the given C string to the ostream. +void PrintTo(const char* s, ostream* os) { + if (s == NULL) { + *os << "NULL"; + } else { + *os << ImplicitCast_(s) << " pointing to "; + PrintCharsAsStringTo(s, strlen(s), os); + } +} + +// MSVC compiler can be configured to define whar_t as a typedef +// of unsigned short. Defining an overload for const wchar_t* in that case +// would cause pointers to unsigned shorts be printed as wide strings, +// possibly accessing more memory than intended and causing invalid +// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when +// wchar_t is implemented as a native type. +#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) +// Prints the given wide C string to the ostream. +void PrintTo(const wchar_t* s, ostream* os) { + if (s == NULL) { + *os << "NULL"; + } else { + *os << ImplicitCast_(s) << " pointing to "; + PrintCharsAsStringTo(s, std::wcslen(s), os); + } +} +#endif // wchar_t is native + +// Prints a ::string object. 
+#if GTEST_HAS_GLOBAL_STRING +void PrintStringTo(const ::string& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} +#endif // GTEST_HAS_GLOBAL_STRING + +void PrintStringTo(const ::std::string& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} + +// Prints a ::wstring object. +#if GTEST_HAS_GLOBAL_WSTRING +void PrintWideStringTo(const ::wstring& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} +#endif // GTEST_HAS_GLOBAL_WSTRING + +#if GTEST_HAS_STD_WSTRING +void PrintWideStringTo(const ::std::wstring& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} +#endif // GTEST_HAS_STD_WSTRING + +} // namespace internal + +} // namespace testing diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-test-part.cc b/libs/libvpx/third_party/googletest/src/src/gtest-test-part.cc new file mode 100644 index 0000000000..fb0e35425e --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/src/gtest-test-part.cc @@ -0,0 +1,110 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: mheule@google.com (Markus Heule) +// +// The Google C++ Testing Framework (Google Test) + +#include "gtest/gtest-test-part.h" + +// Indicates that this translation unit is part of Google Test's +// implementation. It must come before gtest-internal-inl.h is +// included, or there will be a compiler error. This trick exists to +// prevent the accidental inclusion of gtest-internal-inl.h in the +// user's code. +#define GTEST_IMPLEMENTATION_ 1 +#include "src/gtest-internal-inl.h" +#undef GTEST_IMPLEMENTATION_ + +namespace testing { + +using internal::GetUnitTestImpl; + +// Gets the summary of the failure message by omitting the stack trace +// in it. +std::string TestPartResult::ExtractSummary(const char* message) { + const char* const stack_trace = strstr(message, internal::kStackTraceMarker); + return stack_trace == NULL ? message : + std::string(message, stack_trace); +} + +// Prints a TestPartResult object. 
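// Illustrative sketch of ExtractSummary()'s behavior described above: the
// summary is everything before the stack-trace marker, or the whole message
// when the marker is absent. The marker text mirrors kStackTraceMarker.
#include <cstring>
#include <iostream>
#include <string>

std::string SummaryOf(const char* message) {
  static const char kMarker[] = "\nStack trace:\n";
  const char* const trace = std::strstr(message, kMarker);
  return trace == NULL ? std::string(message) : std::string(message, trace);
}

int main() {
  std::cout << SummaryOf("Expected 1 == 2\nStack trace:\n#0 main()") << "\n";
  // Prints just "Expected 1 == 2".
  return 0;
}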
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result) { + return os + << result.file_name() << ":" << result.line_number() << ": " + << (result.type() == TestPartResult::kSuccess ? "Success" : + result.type() == TestPartResult::kFatalFailure ? "Fatal failure" : + "Non-fatal failure") << ":\n" + << result.message() << std::endl; +} + +// Appends a TestPartResult to the array. +void TestPartResultArray::Append(const TestPartResult& result) { + array_.push_back(result); +} + +// Returns the TestPartResult at the given index (0-based). +const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const { + if (index < 0 || index >= size()) { + printf("\nInvalid index (%d) into TestPartResultArray.\n", index); + internal::posix::Abort(); + } + + return array_[index]; +} + +// Returns the number of TestPartResult objects in the array. +int TestPartResultArray::size() const { + return static_cast(array_.size()); +} + +namespace internal { + +HasNewFatalFailureHelper::HasNewFatalFailureHelper() + : has_new_fatal_failure_(false), + original_reporter_(GetUnitTestImpl()-> + GetTestPartResultReporterForCurrentThread()) { + GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this); +} + +HasNewFatalFailureHelper::~HasNewFatalFailureHelper() { + GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread( + original_reporter_); +} + +void HasNewFatalFailureHelper::ReportTestPartResult( + const TestPartResult& result) { + if (result.fatally_failed()) + has_new_fatal_failure_ = true; + original_reporter_->ReportTestPartResult(result); +} + +} // namespace internal + +} // namespace testing diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-typed-test.cc b/libs/libvpx/third_party/googletest/src/src/gtest-typed-test.cc new file mode 100644 index 0000000000..df1eef4754 --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/src/gtest-typed-test.cc @@ -0,0 +1,118 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
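// Illustrative sketch (hypothetical types, not gtest internals) of the RAII
// pattern HasNewFatalFailureHelper uses above: install yourself as the current
// reporter in the constructor, remember the previous reporter, forward every
// result to it, and restore it in the destructor.
#include <iostream>

struct Reporter {
  virtual ~Reporter() {}
  virtual void Report(bool fatal) = 0;
};

struct PrintingReporter : Reporter {
  void Report(bool fatal) { std::cout << (fatal ? "fatal\n" : "non-fatal\n"); }
};

Reporter* g_current_reporter = 0;  // stand-in for the per-thread reporter

struct FatalFailureProbe : Reporter {
  bool saw_fatal;
  Reporter* previous;
  FatalFailureProbe() : saw_fatal(false), previous(g_current_reporter) {
    g_current_reporter = this;                               // intercept
  }
  ~FatalFailureProbe() { g_current_reporter = previous; }    // restore
  void Report(bool fatal) {
    if (fatal) saw_fatal = true;
    previous->Report(fatal);        // still forward to the original reporter
  }
};

int main() {
  PrintingReporter base;
  g_current_reporter = &base;
  {
    FatalFailureProbe probe;
    g_current_reporter->Report(true);
    std::cout << "probe saw fatal: " << probe.saw_fatal << "\n";  // 1
  }
  g_current_reporter->Report(false);  // base reporter is back in place
  return 0;
}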
+// +// Author: wan@google.com (Zhanyong Wan) + +#include "gtest/gtest-typed-test.h" +#include "gtest/gtest.h" + +namespace testing { +namespace internal { + +#if GTEST_HAS_TYPED_TEST_P + +// Skips to the first non-space char in str. Returns an empty string if str +// contains only whitespace characters. +static const char* SkipSpaces(const char* str) { + while (IsSpace(*str)) + str++; + return str; +} + +static std::vector SplitIntoTestNames(const char* src) { + std::vector name_vec; + src = SkipSpaces(src); + for (; src != NULL; src = SkipComma(src)) { + name_vec.push_back(StripTrailingSpaces(GetPrefixUntilComma(src))); + } + return name_vec; +} + +// Verifies that registered_tests match the test names in +// registered_tests_; returns registered_tests if successful, or +// aborts the program otherwise. +const char* TypedTestCasePState::VerifyRegisteredTestNames( + const char* file, int line, const char* registered_tests) { + typedef RegisteredTestsMap::const_iterator RegisteredTestIter; + registered_ = true; + + std::vector name_vec = SplitIntoTestNames(registered_tests); + + Message errors; + + std::set tests; + for (std::vector::const_iterator name_it = name_vec.begin(); + name_it != name_vec.end(); ++name_it) { + const std::string& name = *name_it; + if (tests.count(name) != 0) { + errors << "Test " << name << " is listed more than once.\n"; + continue; + } + + bool found = false; + for (RegisteredTestIter it = registered_tests_.begin(); + it != registered_tests_.end(); + ++it) { + if (name == it->first) { + found = true; + break; + } + } + + if (found) { + tests.insert(name); + } else { + errors << "No test named " << name + << " can be found in this test case.\n"; + } + } + + for (RegisteredTestIter it = registered_tests_.begin(); + it != registered_tests_.end(); + ++it) { + if (tests.count(it->first) == 0) { + errors << "You forgot to list test " << it->first << ".\n"; + } + } + + const std::string& errors_str = errors.GetString(); + if (errors_str != "") { + fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(), + errors_str.c_str()); + fflush(stderr); + posix::Abort(); + } + + return registered_tests; +} + +#endif // GTEST_HAS_TYPED_TEST_P + +} // namespace internal +} // namespace testing diff --git a/libs/libvpx/third_party/googletest/src/src/gtest.cc b/libs/libvpx/third_party/googletest/src/src/gtest.cc new file mode 100644 index 0000000000..5a8932c73e --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/src/gtest.cc @@ -0,0 +1,5389 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// The Google C++ Testing Framework (Google Test) + +#include "gtest/gtest.h" +#include "gtest/internal/custom/gtest.h" +#include "gtest/gtest-spi.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include + +#if GTEST_OS_LINUX + +// TODO(kenton@google.com): Use autoconf to detect availability of +// gettimeofday(). +# define GTEST_HAS_GETTIMEOFDAY_ 1 + +# include // NOLINT +# include // NOLINT +# include // NOLINT +// Declares vsnprintf(). This header is not available on Windows. +# include // NOLINT +# include // NOLINT +# include // NOLINT +# include // NOLINT +# include + +#elif GTEST_OS_SYMBIAN +# define GTEST_HAS_GETTIMEOFDAY_ 1 +# include // NOLINT + +#elif GTEST_OS_ZOS +# define GTEST_HAS_GETTIMEOFDAY_ 1 +# include // NOLINT + +// On z/OS we additionally need strings.h for strcasecmp. +# include // NOLINT + +#elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE. + +# include // NOLINT +# undef min + +#elif GTEST_OS_WINDOWS // We are on Windows proper. + +# include // NOLINT +# include // NOLINT +# include // NOLINT +# include // NOLINT + +# if GTEST_OS_WINDOWS_MINGW +// MinGW has gettimeofday() but not _ftime64(). +// TODO(kenton@google.com): Use autoconf to detect availability of +// gettimeofday(). +// TODO(kenton@google.com): There are other ways to get the time on +// Windows, like GetTickCount() or GetSystemTimeAsFileTime(). MinGW +// supports these. consider using them instead. +# define GTEST_HAS_GETTIMEOFDAY_ 1 +# include // NOLINT +# endif // GTEST_OS_WINDOWS_MINGW + +// cpplint thinks that the header is already included, so we want to +// silence it. +# include // NOLINT +# undef min + +#else + +// Assume other platforms have gettimeofday(). +// TODO(kenton@google.com): Use autoconf to detect availability of +// gettimeofday(). +# define GTEST_HAS_GETTIMEOFDAY_ 1 + +// cpplint thinks that the header is already included, so we want to +// silence it. +# include // NOLINT +# include // NOLINT + +#endif // GTEST_OS_LINUX + +#if GTEST_HAS_EXCEPTIONS +# include +#endif + +#if GTEST_CAN_STREAM_RESULTS_ +# include // NOLINT +# include // NOLINT +# include // NOLINT +# include // NOLINT +#endif + +// Indicates that this translation unit is part of Google Test's +// implementation. It must come before gtest-internal-inl.h is +// included, or there will be a compiler error. This trick is to +// prevent a user from accidentally including gtest-internal-inl.h in +// his code. +#define GTEST_IMPLEMENTATION_ 1 +#include "src/gtest-internal-inl.h" +#undef GTEST_IMPLEMENTATION_ + +#if GTEST_OS_WINDOWS +# define vsnprintf _vsnprintf +#endif // GTEST_OS_WINDOWS + +namespace testing { + +using internal::CountIf; +using internal::ForEach; +using internal::GetElementOr; +using internal::Shuffle; + +// Constants. 
+ +// A test whose test case name or test name matches this filter is +// disabled and not run. +static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*"; + +// A test case whose name matches this filter is considered a death +// test case and will be run before test cases whose name doesn't +// match this filter. +static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*"; + +// A test filter that matches everything. +static const char kUniversalFilter[] = "*"; + +// The default output file for XML output. +static const char kDefaultOutputFile[] = "test_detail.xml"; + +// The environment variable name for the test shard index. +static const char kTestShardIndex[] = "GTEST_SHARD_INDEX"; +// The environment variable name for the total number of test shards. +static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS"; +// The environment variable name for the test shard status file. +static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE"; + +namespace internal { + +// The text used in failure messages to indicate the start of the +// stack trace. +const char kStackTraceMarker[] = "\nStack trace:\n"; + +// g_help_flag is true iff the --help flag or an equivalent form is +// specified on the command line. +bool g_help_flag = false; + +} // namespace internal + +static const char* GetDefaultFilter() { +#ifdef GTEST_TEST_FILTER_ENV_VAR_ + const char* const testbridge_test_only = getenv(GTEST_TEST_FILTER_ENV_VAR_); + if (testbridge_test_only != NULL) { + return testbridge_test_only; + } +#endif // GTEST_TEST_FILTER_ENV_VAR_ + return kUniversalFilter; +} + +GTEST_DEFINE_bool_( + also_run_disabled_tests, + internal::BoolFromGTestEnv("also_run_disabled_tests", false), + "Run disabled tests too, in addition to the tests normally being run."); + +GTEST_DEFINE_bool_( + break_on_failure, + internal::BoolFromGTestEnv("break_on_failure", false), + "True iff a failed assertion should be a debugger break-point."); + +GTEST_DEFINE_bool_( + catch_exceptions, + internal::BoolFromGTestEnv("catch_exceptions", true), + "True iff " GTEST_NAME_ + " should catch exceptions and treat them as test failures."); + +GTEST_DEFINE_string_( + color, + internal::StringFromGTestEnv("color", "auto"), + "Whether to use colors in the output. Valid values: yes, no, " + "and auto. 'auto' means to use colors if the output is " + "being sent to a terminal and the TERM environment variable " + "is set to a terminal type that supports colors."); + +GTEST_DEFINE_string_( + filter, + internal::StringFromGTestEnv("filter", GetDefaultFilter()), + "A colon-separated list of glob (not regex) patterns " + "for filtering the tests to run, optionally followed by a " + "'-' and a : separated list of negative patterns (tests to " + "exclude). A test is run if it matches one of the positive " + "patterns and does not match any of the negative patterns."); + +GTEST_DEFINE_bool_(list_tests, false, + "List all tests without running them."); + +GTEST_DEFINE_string_( + output, + internal::StringFromGTestEnv("output", ""), + "A format (currently must be \"xml\"), optionally followed " + "by a colon and an output file name or directory. A directory " + "is indicated by a trailing pathname separator. " + "Examples: \"xml:filename.xml\", \"xml::directoryname/\". 
" + "If a directory is specified, output files will be created " + "within that directory, with file-names based on the test " + "executable's name and, if necessary, made unique by adding " + "digits."); + +GTEST_DEFINE_bool_( + print_time, + internal::BoolFromGTestEnv("print_time", true), + "True iff " GTEST_NAME_ + " should display elapsed time in text output."); + +GTEST_DEFINE_int32_( + random_seed, + internal::Int32FromGTestEnv("random_seed", 0), + "Random number seed to use when shuffling test orders. Must be in range " + "[1, 99999], or 0 to use a seed based on the current time."); + +GTEST_DEFINE_int32_( + repeat, + internal::Int32FromGTestEnv("repeat", 1), + "How many times to repeat each test. Specify a negative number " + "for repeating forever. Useful for shaking out flaky tests."); + +GTEST_DEFINE_bool_( + show_internal_stack_frames, false, + "True iff " GTEST_NAME_ " should include internal stack frames when " + "printing test failure stack traces."); + +GTEST_DEFINE_bool_( + shuffle, + internal::BoolFromGTestEnv("shuffle", false), + "True iff " GTEST_NAME_ + " should randomize tests' order on every run."); + +GTEST_DEFINE_int32_( + stack_trace_depth, + internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth), + "The maximum number of stack frames to print when an " + "assertion fails. The valid range is 0 through 100, inclusive."); + +GTEST_DEFINE_string_( + stream_result_to, + internal::StringFromGTestEnv("stream_result_to", ""), + "This flag specifies the host name and the port number on which to stream " + "test results. Example: \"localhost:555\". The flag is effective only on " + "Linux."); + +GTEST_DEFINE_bool_( + throw_on_failure, + internal::BoolFromGTestEnv("throw_on_failure", false), + "When this flag is specified, a failed assertion will throw an exception " + "if exceptions are enabled or exit the program with a non-zero code " + "otherwise."); + +#if GTEST_USE_OWN_FLAGFILE_FLAG_ +GTEST_DEFINE_string_( + flagfile, + internal::StringFromGTestEnv("flagfile", ""), + "This flag specifies the flagfile to read command-line flags from."); +#endif // GTEST_USE_OWN_FLAGFILE_FLAG_ + +namespace internal { + +// Generates a random number from [0, range), using a Linear +// Congruential Generator (LCG). Crashes if 'range' is 0 or greater +// than kMaxRange. +GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ +UInt32 Random::Generate(UInt32 range) { + // These constants are the same as are used in glibc's rand(3). + state_ = (1103515245U*state_ + 12345U) % kMaxRange; + + GTEST_CHECK_(range > 0) + << "Cannot generate a number in the range [0, 0)."; + GTEST_CHECK_(range <= kMaxRange) + << "Generation of a number in [0, " << range << ") was requested, " + << "but this can only generate numbers in [0, " << kMaxRange << ")."; + + // Converting via modulus introduces a bit of downward bias, but + // it's simple, and a linear congruential generator isn't too good + // to begin with. + return state_ % range; +} + +// GTestIsInitialized() returns true iff the user has initialized +// Google Test. Useful for catching the user mistake of not initializing +// Google Test before calling RUN_ALL_TESTS(). +static bool GTestIsInitialized() { return GetArgvs().size() > 0; } + +// Iterates over a vector of TestCases, keeping a running sum of the +// results of calling a given int-returning method on each. +// Returns the sum. 
+static int SumOverTestCaseList(const std::vector& case_list, + int (TestCase::*method)() const) { + int sum = 0; + for (size_t i = 0; i < case_list.size(); i++) { + sum += (case_list[i]->*method)(); + } + return sum; +} + +// Returns true iff the test case passed. +static bool TestCasePassed(const TestCase* test_case) { + return test_case->should_run() && test_case->Passed(); +} + +// Returns true iff the test case failed. +static bool TestCaseFailed(const TestCase* test_case) { + return test_case->should_run() && test_case->Failed(); +} + +// Returns true iff test_case contains at least one test that should +// run. +static bool ShouldRunTestCase(const TestCase* test_case) { + return test_case->should_run(); +} + +// AssertHelper constructor. +AssertHelper::AssertHelper(TestPartResult::Type type, + const char* file, + int line, + const char* message) + : data_(new AssertHelperData(type, file, line, message)) { +} + +AssertHelper::~AssertHelper() { + delete data_; +} + +// Message assignment, for assertion streaming support. +void AssertHelper::operator=(const Message& message) const { + UnitTest::GetInstance()-> + AddTestPartResult(data_->type, data_->file, data_->line, + AppendUserMessage(data_->message, message), + UnitTest::GetInstance()->impl() + ->CurrentOsStackTraceExceptTop(1) + // Skips the stack frame for this function itself. + ); // NOLINT +} + +// Mutex for linked pointers. +GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex); + +// A copy of all command line arguments. Set by InitGoogleTest(). +::std::vector g_argvs; + +const ::std::vector& GetArgvs() { +#if defined(GTEST_CUSTOM_GET_ARGVS_) + return GTEST_CUSTOM_GET_ARGVS_(); +#else // defined(GTEST_CUSTOM_GET_ARGVS_) + return g_argvs; +#endif // defined(GTEST_CUSTOM_GET_ARGVS_) +} + +// Returns the current application's name, removing directory path if that +// is present. +FilePath GetCurrentExecutableName() { + FilePath result; + +#if GTEST_OS_WINDOWS + result.Set(FilePath(GetArgvs()[0]).RemoveExtension("exe")); +#else + result.Set(FilePath(GetArgvs()[0])); +#endif // GTEST_OS_WINDOWS + + return result.RemoveDirectoryName(); +} + +// Functions for processing the gtest_output flag. + +// Returns the output format, or "" for normal printed output. +std::string UnitTestOptions::GetOutputFormat() { + const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); + if (gtest_output_flag == NULL) return std::string(""); + + const char* const colon = strchr(gtest_output_flag, ':'); + return (colon == NULL) ? + std::string(gtest_output_flag) : + std::string(gtest_output_flag, colon - gtest_output_flag); +} + +// Returns the name of the requested output file, or the default if none +// was explicitly specified. +std::string UnitTestOptions::GetAbsolutePathToOutputFile() { + const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); + if (gtest_output_flag == NULL) + return ""; + + const char* const colon = strchr(gtest_output_flag, ':'); + if (colon == NULL) + return internal::FilePath::ConcatPaths( + internal::FilePath( + UnitTest::GetInstance()->original_working_dir()), + internal::FilePath(kDefaultOutputFile)).string(); + + internal::FilePath output_name(colon + 1); + if (!output_name.IsAbsolutePath()) + // TODO(wan@google.com): on Windows \some\path is not an absolute + // path (as its meaning depends on the current drive), yet the + // following logic for turning it into an absolute path is wrong. + // Fix it. 
+ output_name = internal::FilePath::ConcatPaths( + internal::FilePath(UnitTest::GetInstance()->original_working_dir()), + internal::FilePath(colon + 1)); + + if (!output_name.IsDirectory()) + return output_name.string(); + + internal::FilePath result(internal::FilePath::GenerateUniqueFileName( + output_name, internal::GetCurrentExecutableName(), + GetOutputFormat().c_str())); + return result.string(); +} + +// Returns true iff the wildcard pattern matches the string. The +// first ':' or '\0' character in pattern marks the end of it. +// +// This recursive algorithm isn't very efficient, but is clear and +// works well enough for matching test names, which are short. +bool UnitTestOptions::PatternMatchesString(const char *pattern, + const char *str) { + switch (*pattern) { + case '\0': + case ':': // Either ':' or '\0' marks the end of the pattern. + return *str == '\0'; + case '?': // Matches any single character. + return *str != '\0' && PatternMatchesString(pattern + 1, str + 1); + case '*': // Matches any string (possibly empty) of characters. + return (*str != '\0' && PatternMatchesString(pattern, str + 1)) || + PatternMatchesString(pattern + 1, str); + default: // Non-special character. Matches itself. + return *pattern == *str && + PatternMatchesString(pattern + 1, str + 1); + } +} + +bool UnitTestOptions::MatchesFilter( + const std::string& name, const char* filter) { + const char *cur_pattern = filter; + for (;;) { + if (PatternMatchesString(cur_pattern, name.c_str())) { + return true; + } + + // Finds the next pattern in the filter. + cur_pattern = strchr(cur_pattern, ':'); + + // Returns if no more pattern can be found. + if (cur_pattern == NULL) { + return false; + } + + // Skips the pattern separater (the ':' character). + cur_pattern++; + } +} + +// Returns true iff the user-specified filter matches the test case +// name and the test name. +bool UnitTestOptions::FilterMatchesTest(const std::string &test_case_name, + const std::string &test_name) { + const std::string& full_name = test_case_name + "." + test_name.c_str(); + + // Split --gtest_filter at '-', if there is one, to separate into + // positive filter and negative filter portions + const char* const p = GTEST_FLAG(filter).c_str(); + const char* const dash = strchr(p, '-'); + std::string positive; + std::string negative; + if (dash == NULL) { + positive = GTEST_FLAG(filter).c_str(); // Whole string is a positive filter + negative = ""; + } else { + positive = std::string(p, dash); // Everything up to the dash + negative = std::string(dash + 1); // Everything after the dash + if (positive.empty()) { + // Treat '-test1' as the same as '*-test1' + positive = kUniversalFilter; + } + } + + // A filter is a colon-separated list of patterns. It matches a + // test if any pattern in it matches the test. + return (MatchesFilter(full_name, positive.c_str()) && + !MatchesFilter(full_name, negative.c_str())); +} + +#if GTEST_HAS_SEH +// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the +// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. +// This function is useful as an __except condition. +int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) { + // Google Test should handle a SEH exception if: + // 1. the user wants it to, AND + // 2. this is not a breakpoint exception, AND + // 3. this is not a C++ exception (VC++ implements them via SEH, + // apparently). + // + // SEH exception code for C++ exceptions. + // (see http://support.microsoft.com/kb/185294 for more information). 
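// Illustrative, standalone sketch of the recursive wildcard matching used by
// the test filter above: '?' matches one character, '*' matches any possibly
// empty run, and ':' or '\0' ends the pattern. GlobMatches is a hypothetical
// name; it mirrors the logic, not the gtest API.
#include <cstdio>

bool GlobMatches(const char* pattern, const char* str) {
  switch (*pattern) {
    case '\0':
    case ':':  return *str == '\0';
    case '?':  return *str != '\0' && GlobMatches(pattern + 1, str + 1);
    case '*':  return (*str != '\0' && GlobMatches(pattern, str + 1)) ||
                      GlobMatches(pattern + 1, str);
    default:   return *pattern == *str && GlobMatches(pattern + 1, str + 1);
  }
}

int main() {
  // Mirrors --gtest_filter semantics: "FooTest.*" matches every test in
  // FooTest; patterns after a '-' in the flag are negative (exclusions).
  std::printf("%d %d\n",
              GlobMatches("FooTest.*", "FooTest.Baz"),     // 1
              GlobMatches("FooTest.?", "FooTest.Barbar")); // 0
  return 0;
}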
+ const DWORD kCxxExceptionCode = 0xe06d7363; + + bool should_handle = true; + + if (!GTEST_FLAG(catch_exceptions)) + should_handle = false; + else if (exception_code == EXCEPTION_BREAKPOINT) + should_handle = false; + else if (exception_code == kCxxExceptionCode) + should_handle = false; + + return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH; +} +#endif // GTEST_HAS_SEH + +} // namespace internal + +// The c'tor sets this object as the test part result reporter used by +// Google Test. The 'result' parameter specifies where to report the +// results. Intercepts only failures from the current thread. +ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( + TestPartResultArray* result) + : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), + result_(result) { + Init(); +} + +// The c'tor sets this object as the test part result reporter used by +// Google Test. The 'result' parameter specifies where to report the +// results. +ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( + InterceptMode intercept_mode, TestPartResultArray* result) + : intercept_mode_(intercept_mode), + result_(result) { + Init(); +} + +void ScopedFakeTestPartResultReporter::Init() { + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + if (intercept_mode_ == INTERCEPT_ALL_THREADS) { + old_reporter_ = impl->GetGlobalTestPartResultReporter(); + impl->SetGlobalTestPartResultReporter(this); + } else { + old_reporter_ = impl->GetTestPartResultReporterForCurrentThread(); + impl->SetTestPartResultReporterForCurrentThread(this); + } +} + +// The d'tor restores the test part result reporter used by Google Test +// before. +ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() { + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + if (intercept_mode_ == INTERCEPT_ALL_THREADS) { + impl->SetGlobalTestPartResultReporter(old_reporter_); + } else { + impl->SetTestPartResultReporterForCurrentThread(old_reporter_); + } +} + +// Increments the test part result count and remembers the result. +// This method is from the TestPartResultReporterInterface interface. +void ScopedFakeTestPartResultReporter::ReportTestPartResult( + const TestPartResult& result) { + result_->Append(result); +} + +namespace internal { + +// Returns the type ID of ::testing::Test. We should always call this +// instead of GetTypeId< ::testing::Test>() to get the type ID of +// testing::Test. This is to work around a suspected linker bug when +// using Google Test as a framework on Mac OS X. The bug causes +// GetTypeId< ::testing::Test>() to return different values depending +// on whether the call is from the Google Test framework itself or +// from user test code. GetTestTypeId() is guaranteed to always +// return the same value, as it always calls GetTypeId<>() from the +// gtest.cc, which is within the Google Test framework. +TypeId GetTestTypeId() { + return GetTypeId(); +} + +// The value of GetTestTypeId() as seen from within the Google Test +// library. This is solely for testing GetTestTypeId(). +extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId(); + +// This predicate-formatter checks that 'results' contains a test part +// failure of the given type and that the failure message contains the +// given substring. 
+AssertionResult HasOneFailure(const char* /* results_expr */, + const char* /* type_expr */, + const char* /* substr_expr */, + const TestPartResultArray& results, + TestPartResult::Type type, + const string& substr) { + const std::string expected(type == TestPartResult::kFatalFailure ? + "1 fatal failure" : + "1 non-fatal failure"); + Message msg; + if (results.size() != 1) { + msg << "Expected: " << expected << "\n" + << " Actual: " << results.size() << " failures"; + for (int i = 0; i < results.size(); i++) { + msg << "\n" << results.GetTestPartResult(i); + } + return AssertionFailure() << msg; + } + + const TestPartResult& r = results.GetTestPartResult(0); + if (r.type() != type) { + return AssertionFailure() << "Expected: " << expected << "\n" + << " Actual:\n" + << r; + } + + if (strstr(r.message(), substr.c_str()) == NULL) { + return AssertionFailure() << "Expected: " << expected << " containing \"" + << substr << "\"\n" + << " Actual:\n" + << r; + } + + return AssertionSuccess(); +} + +// The constructor of SingleFailureChecker remembers where to look up +// test part results, what type of failure we expect, and what +// substring the failure message should contain. +SingleFailureChecker:: SingleFailureChecker( + const TestPartResultArray* results, + TestPartResult::Type type, + const string& substr) + : results_(results), + type_(type), + substr_(substr) {} + +// The destructor of SingleFailureChecker verifies that the given +// TestPartResultArray contains exactly one failure that has the given +// type and contains the given substring. If that's not the case, a +// non-fatal failure will be generated. +SingleFailureChecker::~SingleFailureChecker() { + EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_); +} + +DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter( + UnitTestImpl* unit_test) : unit_test_(unit_test) {} + +void DefaultGlobalTestPartResultReporter::ReportTestPartResult( + const TestPartResult& result) { + unit_test_->current_test_result()->AddTestPartResult(result); + unit_test_->listeners()->repeater()->OnTestPartResult(result); +} + +DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter( + UnitTestImpl* unit_test) : unit_test_(unit_test) {} + +void DefaultPerThreadTestPartResultReporter::ReportTestPartResult( + const TestPartResult& result) { + unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result); +} + +// Returns the global test part result reporter. +TestPartResultReporterInterface* +UnitTestImpl::GetGlobalTestPartResultReporter() { + internal::MutexLock lock(&global_test_part_result_reporter_mutex_); + return global_test_part_result_repoter_; +} + +// Sets the global test part result reporter. +void UnitTestImpl::SetGlobalTestPartResultReporter( + TestPartResultReporterInterface* reporter) { + internal::MutexLock lock(&global_test_part_result_reporter_mutex_); + global_test_part_result_repoter_ = reporter; +} + +// Returns the test part result reporter for the current thread. +TestPartResultReporterInterface* +UnitTestImpl::GetTestPartResultReporterForCurrentThread() { + return per_thread_test_part_result_reporter_.get(); +} + +// Sets the test part result reporter for the current thread. +void UnitTestImpl::SetTestPartResultReporterForCurrentThread( + TestPartResultReporterInterface* reporter) { + per_thread_test_part_result_reporter_.set(reporter); +} + +// Gets the number of successful test cases. 
+int UnitTestImpl::successful_test_case_count() const { + return CountIf(test_cases_, TestCasePassed); +} + +// Gets the number of failed test cases. +int UnitTestImpl::failed_test_case_count() const { + return CountIf(test_cases_, TestCaseFailed); +} + +// Gets the number of all test cases. +int UnitTestImpl::total_test_case_count() const { + return static_cast(test_cases_.size()); +} + +// Gets the number of all test cases that contain at least one test +// that should run. +int UnitTestImpl::test_case_to_run_count() const { + return CountIf(test_cases_, ShouldRunTestCase); +} + +// Gets the number of successful tests. +int UnitTestImpl::successful_test_count() const { + return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count); +} + +// Gets the number of failed tests. +int UnitTestImpl::failed_test_count() const { + return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count); +} + +// Gets the number of disabled tests that will be reported in the XML report. +int UnitTestImpl::reportable_disabled_test_count() const { + return SumOverTestCaseList(test_cases_, + &TestCase::reportable_disabled_test_count); +} + +// Gets the number of disabled tests. +int UnitTestImpl::disabled_test_count() const { + return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count); +} + +// Gets the number of tests to be printed in the XML report. +int UnitTestImpl::reportable_test_count() const { + return SumOverTestCaseList(test_cases_, &TestCase::reportable_test_count); +} + +// Gets the number of all tests. +int UnitTestImpl::total_test_count() const { + return SumOverTestCaseList(test_cases_, &TestCase::total_test_count); +} + +// Gets the number of tests that should run. +int UnitTestImpl::test_to_run_count() const { + return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count); +} + +// Returns the current OS stack trace as an std::string. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// CurrentOsStackTraceExceptTop(1), Foo() will be included in the +// trace but Bar() and CurrentOsStackTraceExceptTop() won't. +std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) { + return os_stack_trace_getter()->CurrentStackTrace( + static_cast(GTEST_FLAG(stack_trace_depth)), + skip_count + 1 + // Skips the user-specified number of frames plus this function + // itself. + ); // NOLINT +} + +// Returns the current time in milliseconds. +TimeInMillis GetTimeInMillis() { +#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__) + // Difference between 1970-01-01 and 1601-01-01 in milliseconds. + // http://analogous.blogspot.com/2005/04/epoch.html + const TimeInMillis kJavaEpochToWinFileTimeDelta = + static_cast(116444736UL) * 100000UL; + const DWORD kTenthMicrosInMilliSecond = 10000; + + SYSTEMTIME now_systime; + FILETIME now_filetime; + ULARGE_INTEGER now_int64; + // TODO(kenton@google.com): Shouldn't this just use + // GetSystemTimeAsFileTime()? 
+ GetSystemTime(&now_systime); + if (SystemTimeToFileTime(&now_systime, &now_filetime)) { + now_int64.LowPart = now_filetime.dwLowDateTime; + now_int64.HighPart = now_filetime.dwHighDateTime; + now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) - + kJavaEpochToWinFileTimeDelta; + return now_int64.QuadPart; + } + return 0; +#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_ + __timeb64 now; + + // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996 + // (deprecated function) there. + // TODO(kenton@google.com): Use GetTickCount()? Or use + // SystemTimeToFileTime() + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) + _ftime64(&now); + GTEST_DISABLE_MSC_WARNINGS_POP_() + + return static_cast(now.time) * 1000 + now.millitm; +#elif GTEST_HAS_GETTIMEOFDAY_ + struct timeval now; + gettimeofday(&now, NULL); + return static_cast(now.tv_sec) * 1000 + now.tv_usec / 1000; +#else +# error "Don't know how to get the current time on your system." +#endif +} + +// Utilities + +// class String. + +#if GTEST_OS_WINDOWS_MOBILE +// Creates a UTF-16 wide string from the given ANSI string, allocating +// memory using new. The caller is responsible for deleting the return +// value using delete[]. Returns the wide string, or NULL if the +// input is NULL. +LPCWSTR String::AnsiToUtf16(const char* ansi) { + if (!ansi) return NULL; + const int length = strlen(ansi); + const int unicode_length = + MultiByteToWideChar(CP_ACP, 0, ansi, length, + NULL, 0); + WCHAR* unicode = new WCHAR[unicode_length + 1]; + MultiByteToWideChar(CP_ACP, 0, ansi, length, + unicode, unicode_length); + unicode[unicode_length] = 0; + return unicode; +} + +// Creates an ANSI string from the given wide string, allocating +// memory using new. The caller is responsible for deleting the return +// value using delete[]. Returns the ANSI string, or NULL if the +// input is NULL. +const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { + if (!utf16_str) return NULL; + const int ansi_length = + WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, + NULL, 0, NULL, NULL); + char* ansi = new char[ansi_length + 1]; + WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, + ansi, ansi_length, NULL, NULL); + ansi[ansi_length] = 0; + return ansi; +} + +#endif // GTEST_OS_WINDOWS_MOBILE + +// Compares two C strings. Returns true iff they have the same content. +// +// Unlike strcmp(), this function can handle NULL argument(s). A NULL +// C string is considered different to any non-NULL C string, +// including the empty string. +bool String::CStringEquals(const char * lhs, const char * rhs) { + if ( lhs == NULL ) return rhs == NULL; + + if ( rhs == NULL ) return false; + + return strcmp(lhs, rhs) == 0; +} + +#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING + +// Converts an array of wide chars to a narrow string using the UTF-8 +// encoding, and streams the result to the given Message object. 
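// Illustrative sketch of the gettimeofday() branch of GetTimeInMillis()
// above; POSIX-only (the Windows branches above use FILETIME or _ftime64).
#include <sys/time.h>
#include <cstdio>

long long NowInMillis() {
  struct timeval now;
  gettimeofday(&now, NULL);
  // Seconds scaled to milliseconds plus the microsecond remainder.
  return static_cast<long long>(now.tv_sec) * 1000 + now.tv_usec / 1000;
}

int main() {
  std::printf("ms since epoch: %lld\n", NowInMillis());
  return 0;
}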
+static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length, + Message* msg) { + for (size_t i = 0; i != length; ) { // NOLINT + if (wstr[i] != L'\0') { + *msg << WideStringToUtf8(wstr + i, static_cast(length - i)); + while (i != length && wstr[i] != L'\0') + i++; + } else { + *msg << '\0'; + i++; + } + } +} + +#endif // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING + +void SplitString(const ::std::string& str, char delimiter, + ::std::vector< ::std::string>* dest) { + ::std::vector< ::std::string> parsed; + ::std::string::size_type pos = 0; + while (::testing::internal::AlwaysTrue()) { + const ::std::string::size_type colon = str.find(delimiter, pos); + if (colon == ::std::string::npos) { + parsed.push_back(str.substr(pos)); + break; + } else { + parsed.push_back(str.substr(pos, colon - pos)); + pos = colon + 1; + } + } + dest->swap(parsed); +} + +} // namespace internal + +// Constructs an empty Message. +// We allocate the stringstream separately because otherwise each use of +// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's +// stack frame leading to huge stack frames in some cases; gcc does not reuse +// the stack space. +Message::Message() : ss_(new ::std::stringstream) { + // By default, we want there to be enough precision when printing + // a double to a Message. + *ss_ << std::setprecision(std::numeric_limits::digits10 + 2); +} + +// These two overloads allow streaming a wide C string to a Message +// using the UTF-8 encoding. +Message& Message::operator <<(const wchar_t* wide_c_str) { + return *this << internal::String::ShowWideCString(wide_c_str); +} +Message& Message::operator <<(wchar_t* wide_c_str) { + return *this << internal::String::ShowWideCString(wide_c_str); +} + +#if GTEST_HAS_STD_WSTRING +// Converts the given wide string to a narrow string using the UTF-8 +// encoding, and streams the result to this Message object. +Message& Message::operator <<(const ::std::wstring& wstr) { + internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); + return *this; +} +#endif // GTEST_HAS_STD_WSTRING + +#if GTEST_HAS_GLOBAL_WSTRING +// Converts the given wide string to a narrow string using the UTF-8 +// encoding, and streams the result to this Message object. +Message& Message::operator <<(const ::wstring& wstr) { + internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); + return *this; +} +#endif // GTEST_HAS_GLOBAL_WSTRING + +// Gets the text streamed to this object so far as an std::string. +// Each '\0' character in the buffer is replaced with "\\0". +std::string Message::GetString() const { + return internal::StringStreamToString(ss_.get()); +} + +// AssertionResult constructors. +// Used in EXPECT_TRUE/FALSE(assertion_result). +AssertionResult::AssertionResult(const AssertionResult& other) + : success_(other.success_), + message_(other.message_.get() != NULL ? + new ::std::string(*other.message_) : + static_cast< ::std::string*>(NULL)) { +} + +// Swaps two AssertionResults. +void AssertionResult::swap(AssertionResult& other) { + using std::swap; + swap(success_, other.success_); + swap(message_, other.message_); +} + +// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. +AssertionResult AssertionResult::operator!() const { + AssertionResult negation(!success_); + if (message_.get() != NULL) + negation << *message_; + return negation; +} + +// Makes a successful assertion result. +AssertionResult AssertionSuccess() { + return AssertionResult(true); +} + +// Makes a failed assertion result. 
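// Illustrative sketch of the splitting behavior implemented by SplitString()
// above (Split is a hypothetical stand-in): every delimiter produces a field,
// so "a::b" yields an empty middle element, and an input with no delimiter
// yields a single element.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

std::vector<std::string> Split(const std::string& s, char delim) {
  std::vector<std::string> out;
  std::string::size_type pos = 0;
  for (;;) {
    const std::string::size_type next = s.find(delim, pos);
    if (next == std::string::npos) {
      out.push_back(s.substr(pos));
      break;
    }
    out.push_back(s.substr(pos, next - pos));
    pos = next + 1;
  }
  return out;
}

int main() {
  const std::vector<std::string> parts = Split("a::b", ':');
  for (std::size_t i = 0; i < parts.size(); ++i)
    std::cout << i << ": \"" << parts[i] << "\"\n";  // "a", "", "b"
  return 0;
}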
+AssertionResult AssertionFailure() { + return AssertionResult(false); +} + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << message. +AssertionResult AssertionFailure(const Message& message) { + return AssertionFailure() << message; +} + +namespace internal { + +namespace edit_distance { +std::vector CalculateOptimalEdits(const std::vector& left, + const std::vector& right) { + std::vector > costs( + left.size() + 1, std::vector(right.size() + 1)); + std::vector > best_move( + left.size() + 1, std::vector(right.size() + 1)); + + // Populate for empty right. + for (size_t l_i = 0; l_i < costs.size(); ++l_i) { + costs[l_i][0] = static_cast(l_i); + best_move[l_i][0] = kRemove; + } + // Populate for empty left. + for (size_t r_i = 1; r_i < costs[0].size(); ++r_i) { + costs[0][r_i] = static_cast(r_i); + best_move[0][r_i] = kAdd; + } + + for (size_t l_i = 0; l_i < left.size(); ++l_i) { + for (size_t r_i = 0; r_i < right.size(); ++r_i) { + if (left[l_i] == right[r_i]) { + // Found a match. Consume it. + costs[l_i + 1][r_i + 1] = costs[l_i][r_i]; + best_move[l_i + 1][r_i + 1] = kMatch; + continue; + } + + const double add = costs[l_i + 1][r_i]; + const double remove = costs[l_i][r_i + 1]; + const double replace = costs[l_i][r_i]; + if (add < remove && add < replace) { + costs[l_i + 1][r_i + 1] = add + 1; + best_move[l_i + 1][r_i + 1] = kAdd; + } else if (remove < add && remove < replace) { + costs[l_i + 1][r_i + 1] = remove + 1; + best_move[l_i + 1][r_i + 1] = kRemove; + } else { + // We make replace a little more expensive than add/remove to lower + // their priority. + costs[l_i + 1][r_i + 1] = replace + 1.00001; + best_move[l_i + 1][r_i + 1] = kReplace; + } + } + } + + // Reconstruct the best path. We do it in reverse order. + std::vector best_path; + for (size_t l_i = left.size(), r_i = right.size(); l_i > 0 || r_i > 0;) { + EditType move = best_move[l_i][r_i]; + best_path.push_back(move); + l_i -= move != kAdd; + r_i -= move != kRemove; + } + std::reverse(best_path.begin(), best_path.end()); + return best_path; +} + +namespace { + +// Helper class to convert string into ids with deduplication. +class InternalStrings { + public: + size_t GetId(const std::string& str) { + IdMap::iterator it = ids_.find(str); + if (it != ids_.end()) return it->second; + size_t id = ids_.size(); + return ids_[str] = id; + } + + private: + typedef std::map IdMap; + IdMap ids_; +}; + +} // namespace + +std::vector CalculateOptimalEdits( + const std::vector& left, + const std::vector& right) { + std::vector left_ids, right_ids; + { + InternalStrings intern_table; + for (size_t i = 0; i < left.size(); ++i) { + left_ids.push_back(intern_table.GetId(left[i])); + } + for (size_t i = 0; i < right.size(); ++i) { + right_ids.push_back(intern_table.GetId(right[i])); + } + } + return CalculateOptimalEdits(left_ids, right_ids); +} + +namespace { + +// Helper class that holds the state for one hunk and prints it out to the +// stream. +// It reorders adds/removes when possible to group all removes before all +// adds. It also adds the hunk header before printint into the stream. 
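// Illustrative, self-contained sketch of the edit-script DP described above:
// fill a cost table, prefer a match, otherwise choose add/remove, with
// replace made slightly more expensive, then walk the best-move table
// backwards. Simplified tie-breaking; it mirrors the idea, not the exact
// gtest implementation.
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

enum Edit { kMatch, kAdd, kRemove, kReplace };

std::vector<Edit> EditScript(const std::vector<int>& l,
                             const std::vector<int>& r) {
  std::vector<std::vector<double> > cost(
      l.size() + 1, std::vector<double>(r.size() + 1));
  std::vector<std::vector<Edit> > move(
      l.size() + 1, std::vector<Edit>(r.size() + 1));
  for (std::size_t i = 0; i <= l.size(); ++i) {
    cost[i][0] = static_cast<double>(i);
    move[i][0] = kRemove;
  }
  for (std::size_t j = 1; j <= r.size(); ++j) {
    cost[0][j] = static_cast<double>(j);
    move[0][j] = kAdd;
  }
  for (std::size_t i = 0; i < l.size(); ++i) {
    for (std::size_t j = 0; j < r.size(); ++j) {
      if (l[i] == r[j]) {  // match: consume both sides for free
        cost[i + 1][j + 1] = cost[i][j];
        move[i + 1][j + 1] = kMatch;
        continue;
      }
      const double add = cost[i + 1][j] + 1;
      const double rem = cost[i][j + 1] + 1;
      const double rep = cost[i][j] + 1.00001;  // de-prioritize replace
      if (add <= rem && add <= rep) {
        cost[i + 1][j + 1] = add;  move[i + 1][j + 1] = kAdd;
      } else if (rem <= rep) {
        cost[i + 1][j + 1] = rem;  move[i + 1][j + 1] = kRemove;
      } else {
        cost[i + 1][j + 1] = rep;  move[i + 1][j + 1] = kReplace;
      }
    }
  }
  // Reconstruct the path in reverse.
  std::vector<Edit> path;
  for (std::size_t i = l.size(), j = r.size(); i > 0 || j > 0;) {
    const Edit m = move[i][j];
    path.push_back(m);
    if (m != kAdd) --i;
    if (m != kRemove) --j;
  }
  std::reverse(path.begin(), path.end());
  return path;
}

int main() {
  const int a[] = {1, 2, 3}, b[] = {1, 3};
  const std::vector<Edit> s = EditScript(std::vector<int>(a, a + 3),
                                         std::vector<int>(b, b + 2));
  const char* names[] = {"match", "add", "remove", "replace"};
  for (std::size_t i = 0; i < s.size(); ++i) std::printf("%s ", names[s[i]]);
  std::printf("\n");  // expected: match remove match
  return 0;
}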
+class Hunk { + public: + Hunk(size_t left_start, size_t right_start) + : left_start_(left_start), + right_start_(right_start), + adds_(), + removes_(), + common_() {} + + void PushLine(char edit, const char* line) { + switch (edit) { + case ' ': + ++common_; + FlushEdits(); + hunk_.push_back(std::make_pair(' ', line)); + break; + case '-': + ++removes_; + hunk_removes_.push_back(std::make_pair('-', line)); + break; + case '+': + ++adds_; + hunk_adds_.push_back(std::make_pair('+', line)); + break; + } + } + + void PrintTo(std::ostream* os) { + PrintHeader(os); + FlushEdits(); + for (std::list >::const_iterator it = + hunk_.begin(); + it != hunk_.end(); ++it) { + *os << it->first << it->second << "\n"; + } + } + + bool has_edits() const { return adds_ || removes_; } + + private: + void FlushEdits() { + hunk_.splice(hunk_.end(), hunk_removes_); + hunk_.splice(hunk_.end(), hunk_adds_); + } + + // Print a unified diff header for one hunk. + // The format is + // "@@ -, +, @@" + // where the left/right parts are ommitted if unnecessary. + void PrintHeader(std::ostream* ss) const { + *ss << "@@ "; + if (removes_) { + *ss << "-" << left_start_ << "," << (removes_ + common_); + } + if (removes_ && adds_) { + *ss << " "; + } + if (adds_) { + *ss << "+" << right_start_ << "," << (adds_ + common_); + } + *ss << " @@\n"; + } + + size_t left_start_, right_start_; + size_t adds_, removes_, common_; + std::list > hunk_, hunk_adds_, hunk_removes_; +}; + +} // namespace + +// Create a list of diff hunks in Unified diff format. +// Each hunk has a header generated by PrintHeader above plus a body with +// lines prefixed with ' ' for no change, '-' for deletion and '+' for +// addition. +// 'context' represents the desired unchanged prefix/suffix around the diff. +// If two hunks are close enough that their contexts overlap, then they are +// joined into one hunk. +std::string CreateUnifiedDiff(const std::vector& left, + const std::vector& right, + size_t context) { + const std::vector edits = CalculateOptimalEdits(left, right); + + size_t l_i = 0, r_i = 0, edit_i = 0; + std::stringstream ss; + while (edit_i < edits.size()) { + // Find first edit. + while (edit_i < edits.size() && edits[edit_i] == kMatch) { + ++l_i; + ++r_i; + ++edit_i; + } + + // Find the first line to include in the hunk. + const size_t prefix_context = std::min(l_i, context); + Hunk hunk(l_i - prefix_context + 1, r_i - prefix_context + 1); + for (size_t i = prefix_context; i > 0; --i) { + hunk.PushLine(' ', left[l_i - i].c_str()); + } + + // Iterate the edits until we found enough suffix for the hunk or the input + // is over. + size_t n_suffix = 0; + for (; edit_i < edits.size(); ++edit_i) { + if (n_suffix >= context) { + // Continue only if the next hunk is very close. + std::vector::const_iterator it = edits.begin() + edit_i; + while (it != edits.end() && *it == kMatch) ++it; + if (it == edits.end() || (it - edits.begin()) - edit_i >= context) { + // There is no next edit or it is too far away. + break; + } + } + + EditType edit = edits[edit_i]; + // Reset count when a non match is found. + n_suffix = edit == kMatch ? n_suffix + 1 : 0; + + if (edit == kMatch || edit == kRemove || edit == kReplace) { + hunk.PushLine(edit == kMatch ? ' ' : '-', left[l_i].c_str()); + } + if (edit == kAdd || edit == kReplace) { + hunk.PushLine('+', right[r_i].c_str()); + } + + // Advance indices, depending on edit type. + l_i += edit != kAdd; + r_i += edit != kRemove; + } + + if (!hunk.has_edits()) { + // We are done. We don't want this hunk. 
+ break; + } + + hunk.PrintTo(&ss); + } + return ss.str(); +} + +} // namespace edit_distance + +namespace { + +// The string representation of the values received in EqFailure() are already +// escaped. Split them on escaped '\n' boundaries. Leave all other escaped +// characters the same. +std::vector SplitEscapedString(const std::string& str) { + std::vector lines; + size_t start = 0, end = str.size(); + if (end > 2 && str[0] == '"' && str[end - 1] == '"') { + ++start; + --end; + } + bool escaped = false; + for (size_t i = start; i + 1 < end; ++i) { + if (escaped) { + escaped = false; + if (str[i] == 'n') { + lines.push_back(str.substr(start, i - start - 1)); + start = i + 1; + } + } else { + escaped = str[i] == '\\'; + } + } + lines.push_back(str.substr(start, end - start)); + return lines; +} + +} // namespace + +// Constructs and returns the message for an equality assertion +// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. +// +// The first four parameters are the expressions used in the assertion +// and their values, as strings. For example, for ASSERT_EQ(foo, bar) +// where foo is 5 and bar is 6, we have: +// +// lhs_expression: "foo" +// rhs_expression: "bar" +// lhs_value: "5" +// rhs_value: "6" +// +// The ignoring_case parameter is true iff the assertion is a +// *_STRCASEEQ*. When it's true, the string "Ignoring case" will +// be inserted into the message. +AssertionResult EqFailure(const char* lhs_expression, + const char* rhs_expression, + const std::string& lhs_value, + const std::string& rhs_value, + bool ignoring_case) { + Message msg; + msg << " Expected: " << lhs_expression; + if (lhs_value != lhs_expression) { + msg << "\n Which is: " << lhs_value; + } + msg << "\nTo be equal to: " << rhs_expression; + if (rhs_value != rhs_expression) { + msg << "\n Which is: " << rhs_value; + } + + if (ignoring_case) { + msg << "\nIgnoring case"; + } + + if (!lhs_value.empty() && !rhs_value.empty()) { + const std::vector lhs_lines = + SplitEscapedString(lhs_value); + const std::vector rhs_lines = + SplitEscapedString(rhs_value); + if (lhs_lines.size() > 1 || rhs_lines.size() > 1) { + msg << "\nWith diff:\n" + << edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines); + } + } + + return AssertionFailure() << msg; +} + +// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. +std::string GetBoolAssertionFailureMessage( + const AssertionResult& assertion_result, + const char* expression_text, + const char* actual_predicate_value, + const char* expected_predicate_value) { + const char* actual_message = assertion_result.message(); + Message msg; + msg << "Value of: " << expression_text + << "\n Actual: " << actual_predicate_value; + if (actual_message[0] != '\0') + msg << " (" << actual_message << ")"; + msg << "\nExpected: " << expected_predicate_value; + return msg.GetString(); +} + +// Helper function for implementing ASSERT_NEAR. +AssertionResult DoubleNearPredFormat(const char* expr1, + const char* expr2, + const char* abs_error_expr, + double val1, + double val2, + double abs_error) { + const double diff = fabs(val1 - val2); + if (diff <= abs_error) return AssertionSuccess(); + + // TODO(wan): do not print the value of an expression if it's + // already a literal. 
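// Illustrative sketch of SplitEscapedString() above (SplitEscaped is a
// hypothetical stand-in): the values are already escaped, so lines are split
// on the two-character sequence "\n" (backslash followed by 'n'), after
// stripping surrounding quotes. This is what lets EqFailure() print a
// per-line unified diff of multi-line values.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

std::vector<std::string> SplitEscaped(const std::string& str) {
  std::vector<std::string> lines;
  std::size_t start = 0, end = str.size();
  if (end > 2 && str[0] == '"' && str[end - 1] == '"') { ++start; --end; }
  bool escaped = false;
  for (std::size_t i = start; i + 1 < end; ++i) {
    if (escaped) {
      escaped = false;
      if (str[i] == 'n') {
        lines.push_back(str.substr(start, i - start - 1));
        start = i + 1;
      }
    } else {
      escaped = str[i] == '\\';
    }
  }
  lines.push_back(str.substr(start, end - start));
  return lines;
}

int main() {
  // The escaped value "first\nsecond" splits into "first" and "second".
  const std::vector<std::string> lines = SplitEscaped("\"first\\nsecond\"");
  for (std::size_t i = 0; i < lines.size(); ++i) std::cout << lines[i] << "\n";
  return 0;
}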
+ return AssertionFailure() + << "The difference between " << expr1 << " and " << expr2 + << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n" + << expr1 << " evaluates to " << val1 << ",\n" + << expr2 << " evaluates to " << val2 << ", and\n" + << abs_error_expr << " evaluates to " << abs_error << "."; +} + + +// Helper template for implementing FloatLE() and DoubleLE(). +template +AssertionResult FloatingPointLE(const char* expr1, + const char* expr2, + RawType val1, + RawType val2) { + // Returns success if val1 is less than val2, + if (val1 < val2) { + return AssertionSuccess(); + } + + // or if val1 is almost equal to val2. + const FloatingPoint lhs(val1), rhs(val2); + if (lhs.AlmostEquals(rhs)) { + return AssertionSuccess(); + } + + // Note that the above two checks will both fail if either val1 or + // val2 is NaN, as the IEEE floating-point standard requires that + // any predicate involving a NaN must return false. + + ::std::stringstream val1_ss; + val1_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << val1; + + ::std::stringstream val2_ss; + val2_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << val2; + + return AssertionFailure() + << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n" + << " Actual: " << StringStreamToString(&val1_ss) << " vs " + << StringStreamToString(&val2_ss); +} + +} // namespace internal + +// Asserts that val1 is less than, or almost equal to, val2. Fails +// otherwise. In particular, it fails if either val1 or val2 is NaN. +AssertionResult FloatLE(const char* expr1, const char* expr2, + float val1, float val2) { + return internal::FloatingPointLE(expr1, expr2, val1, val2); +} + +// Asserts that val1 is less than, or almost equal to, val2. Fails +// otherwise. In particular, it fails if either val1 or val2 is NaN. +AssertionResult DoubleLE(const char* expr1, const char* expr2, + double val1, double val2) { + return internal::FloatingPointLE(expr1, expr2, val1, val2); +} + +namespace internal { + +// The helper function for {ASSERT|EXPECT}_EQ with int or enum +// arguments. +AssertionResult CmpHelperEQ(const char* lhs_expression, + const char* rhs_expression, + BiggestInt lhs, + BiggestInt rhs) { + if (lhs == rhs) { + return AssertionSuccess(); + } + + return EqFailure(lhs_expression, + rhs_expression, + FormatForComparisonFailureMessage(lhs, rhs), + FormatForComparisonFailureMessage(rhs, lhs), + false); +} + +// A macro for implementing the helper functions needed to implement +// ASSERT_?? and EXPECT_?? with integer or enum arguments. It is here +// just to avoid copy-and-paste of similar code. +#define GTEST_IMPL_CMP_HELPER_(op_name, op)\ +AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ + BiggestInt val1, BiggestInt val2) {\ + if (val1 op val2) {\ + return AssertionSuccess();\ + } else {\ + return AssertionFailure() \ + << "Expected: (" << expr1 << ") " #op " (" << expr2\ + << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\ + << " vs " << FormatForComparisonFailureMessage(val2, val1);\ + }\ +} + +// Implements the helper function for {ASSERT|EXPECT}_NE with int or +// enum arguments. +GTEST_IMPL_CMP_HELPER_(NE, !=) +// Implements the helper function for {ASSERT|EXPECT}_LE with int or +// enum arguments. +GTEST_IMPL_CMP_HELPER_(LE, <=) +// Implements the helper function for {ASSERT|EXPECT}_LT with int or +// enum arguments. +GTEST_IMPL_CMP_HELPER_(LT, < ) +// Implements the helper function for {ASSERT|EXPECT}_GE with int or +// enum arguments. 
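// Illustrative sketch of the ASSERT_NEAR check implemented above: the
// assertion passes when |val1 - val2| <= abs_error. (FloatLE/DoubleLE above
// instead accept val1 < val2 or values that are "almost equal", which gtest
// defines in units in the last place rather than as an absolute tolerance.)
#include <cmath>
#include <cstdio>

bool NearlyEqual(double val1, double val2, double abs_error) {
  return std::fabs(val1 - val2) <= abs_error;
}

int main() {
  std::printf("%d\n", NearlyEqual(1.0, 1.004, 0.01));  // 1: within 0.01
  std::printf("%d\n", NearlyEqual(1.0, 1.02, 0.01));   // 0: exceeds 0.01
  // Like the real predicate, this fails if either value is NaN, because any
  // comparison involving NaN is false.
  return 0;
}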
+GTEST_IMPL_CMP_HELPER_(GE, >=) +// Implements the helper function for {ASSERT|EXPECT}_GT with int or +// enum arguments. +GTEST_IMPL_CMP_HELPER_(GT, > ) + +#undef GTEST_IMPL_CMP_HELPER_ + +// The helper function for {ASSERT|EXPECT}_STREQ. +AssertionResult CmpHelperSTREQ(const char* lhs_expression, + const char* rhs_expression, + const char* lhs, + const char* rhs) { + if (String::CStringEquals(lhs, rhs)) { + return AssertionSuccess(); + } + + return EqFailure(lhs_expression, + rhs_expression, + PrintToString(lhs), + PrintToString(rhs), + false); +} + +// The helper function for {ASSERT|EXPECT}_STRCASEEQ. +AssertionResult CmpHelperSTRCASEEQ(const char* lhs_expression, + const char* rhs_expression, + const char* lhs, + const char* rhs) { + if (String::CaseInsensitiveCStringEquals(lhs, rhs)) { + return AssertionSuccess(); + } + + return EqFailure(lhs_expression, + rhs_expression, + PrintToString(lhs), + PrintToString(rhs), + true); +} + +// The helper function for {ASSERT|EXPECT}_STRNE. +AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2) { + if (!String::CStringEquals(s1, s2)) { + return AssertionSuccess(); + } else { + return AssertionFailure() << "Expected: (" << s1_expression << ") != (" + << s2_expression << "), actual: \"" + << s1 << "\" vs \"" << s2 << "\""; + } +} + +// The helper function for {ASSERT|EXPECT}_STRCASENE. +AssertionResult CmpHelperSTRCASENE(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2) { + if (!String::CaseInsensitiveCStringEquals(s1, s2)) { + return AssertionSuccess(); + } else { + return AssertionFailure() + << "Expected: (" << s1_expression << ") != (" + << s2_expression << ") (ignoring case), actual: \"" + << s1 << "\" vs \"" << s2 << "\""; + } +} + +} // namespace internal + +namespace { + +// Helper functions for implementing IsSubString() and IsNotSubstring(). + +// This group of overloaded functions return true iff needle is a +// substring of haystack. NULL is considered a substring of itself +// only. + +bool IsSubstringPred(const char* needle, const char* haystack) { + if (needle == NULL || haystack == NULL) + return needle == haystack; + + return strstr(haystack, needle) != NULL; +} + +bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) { + if (needle == NULL || haystack == NULL) + return needle == haystack; + + return wcsstr(haystack, needle) != NULL; +} + +// StringType here can be either ::std::string or ::std::wstring. +template +bool IsSubstringPred(const StringType& needle, + const StringType& haystack) { + return haystack.find(needle) != StringType::npos; +} + +// This function implements either IsSubstring() or IsNotSubstring(), +// depending on the value of the expected_to_be_substring parameter. +// StringType here can be const char*, const wchar_t*, ::std::string, +// or ::std::wstring. +template +AssertionResult IsSubstringImpl( + bool expected_to_be_substring, + const char* needle_expr, const char* haystack_expr, + const StringType& needle, const StringType& haystack) { + if (IsSubstringPred(needle, haystack) == expected_to_be_substring) + return AssertionSuccess(); + + const bool is_wide_string = sizeof(needle[0]) > 1; + const char* const begin_string_quote = is_wide_string ? "L\"" : "\""; + return AssertionFailure() + << "Value of: " << needle_expr << "\n" + << " Actual: " << begin_string_quote << needle << "\"\n" + << "Expected: " << (expected_to_be_substring ? 
"" : "not ") + << "a substring of " << haystack_expr << "\n" + << "Which is: " << begin_string_quote << haystack << "\""; +} + +} // namespace + +// IsSubstring() and IsNotSubstring() check whether needle is a +// substring of haystack (NULL is considered a substring of itself +// only), and return an appropriate error message when they fail. + +AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const char* needle, const char* haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const wchar_t* needle, const wchar_t* haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const char* needle, const char* haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const wchar_t* needle, const wchar_t* haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::string& needle, const ::std::string& haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::string& needle, const ::std::string& haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} + +#if GTEST_HAS_STD_WSTRING +AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::wstring& needle, const ::std::wstring& haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::wstring& needle, const ::std::wstring& haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} +#endif // GTEST_HAS_STD_WSTRING + +namespace internal { + +#if GTEST_OS_WINDOWS + +namespace { + +// Helper function for IsHRESULT{SuccessFailure} predicates +AssertionResult HRESULTFailureHelper(const char* expr, + const char* expected, + long hr) { // NOLINT +# if GTEST_OS_WINDOWS_MOBILE + + // Windows CE doesn't support FormatMessage. + const char error_text[] = ""; + +# else + + // Looks up the human-readable system message for the HRESULT code + // and since we're not passing any params to FormatMessage, we don't + // want inserts expanded. + const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS; + const DWORD kBufSize = 4096; + // Gets the system's human readable message string for this HRESULT. 
+ char error_text[kBufSize] = { '\0' }; + DWORD message_length = ::FormatMessageA(kFlags, + 0, // no source, we're asking system + hr, // the error + 0, // no line width restrictions + error_text, // output buffer + kBufSize, // buf size + NULL); // no arguments for inserts + // Trims tailing white space (FormatMessage leaves a trailing CR-LF) + for (; message_length && IsSpace(error_text[message_length - 1]); + --message_length) { + error_text[message_length - 1] = '\0'; + } + +# endif // GTEST_OS_WINDOWS_MOBILE + + const std::string error_hex("0x" + String::FormatHexInt(hr)); + return ::testing::AssertionFailure() + << "Expected: " << expr << " " << expected << ".\n" + << " Actual: " << error_hex << " " << error_text << "\n"; +} + +} // namespace + +AssertionResult IsHRESULTSuccess(const char* expr, long hr) { // NOLINT + if (SUCCEEDED(hr)) { + return AssertionSuccess(); + } + return HRESULTFailureHelper(expr, "succeeds", hr); +} + +AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT + if (FAILED(hr)) { + return AssertionSuccess(); + } + return HRESULTFailureHelper(expr, "fails", hr); +} + +#endif // GTEST_OS_WINDOWS + +// Utility functions for encoding Unicode text (wide strings) in +// UTF-8. + +// A Unicode code-point can have upto 21 bits, and is encoded in UTF-8 +// like this: +// +// Code-point length Encoding +// 0 - 7 bits 0xxxxxxx +// 8 - 11 bits 110xxxxx 10xxxxxx +// 12 - 16 bits 1110xxxx 10xxxxxx 10xxxxxx +// 17 - 21 bits 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + +// The maximum code-point a one-byte UTF-8 sequence can represent. +const UInt32 kMaxCodePoint1 = (static_cast(1) << 7) - 1; + +// The maximum code-point a two-byte UTF-8 sequence can represent. +const UInt32 kMaxCodePoint2 = (static_cast(1) << (5 + 6)) - 1; + +// The maximum code-point a three-byte UTF-8 sequence can represent. +const UInt32 kMaxCodePoint3 = (static_cast(1) << (4 + 2*6)) - 1; + +// The maximum code-point a four-byte UTF-8 sequence can represent. +const UInt32 kMaxCodePoint4 = (static_cast(1) << (3 + 3*6)) - 1; + +// Chops off the n lowest bits from a bit pattern. Returns the n +// lowest bits. As a side effect, the original bit pattern will be +// shifted to the right by n bits. +inline UInt32 ChopLowBits(UInt32* bits, int n) { + const UInt32 low_bits = *bits & ((static_cast(1) << n) - 1); + *bits >>= n; + return low_bits; +} + +// Converts a Unicode code point to a narrow string in UTF-8 encoding. +// code_point parameter is of type UInt32 because wchar_t may not be +// wide enough to contain a code point. +// If the code_point is not a valid Unicode code point +// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted +// to "(Invalid Unicode 0xXXXXXXXX)". +std::string CodePointToUtf8(UInt32 code_point) { + if (code_point > kMaxCodePoint4) { + return "(Invalid Unicode 0x" + String::FormatHexInt(code_point) + ")"; + } + + char str[5]; // Big enough for the largest valid code point. 
+ if (code_point <= kMaxCodePoint1) { + str[1] = '\0'; + str[0] = static_cast(code_point); // 0xxxxxxx + } else if (code_point <= kMaxCodePoint2) { + str[2] = '\0'; + str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[0] = static_cast(0xC0 | code_point); // 110xxxxx + } else if (code_point <= kMaxCodePoint3) { + str[3] = '\0'; + str[2] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[0] = static_cast(0xE0 | code_point); // 1110xxxx + } else { // code_point <= kMaxCodePoint4 + str[4] = '\0'; + str[3] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[2] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[0] = static_cast(0xF0 | code_point); // 11110xxx + } + return str; +} + +// The following two functions only make sense if the the system +// uses UTF-16 for wide string encoding. All supported systems +// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16. + +// Determines if the arguments constitute UTF-16 surrogate pair +// and thus should be combined into a single Unicode code point +// using CreateCodePointFromUtf16SurrogatePair. +inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) { + return sizeof(wchar_t) == 2 && + (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00; +} + +// Creates a Unicode code point from UTF16 surrogate pair. +inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first, + wchar_t second) { + const UInt32 mask = (1 << 10) - 1; + return (sizeof(wchar_t) == 2) ? + (((first & mask) << 10) | (second & mask)) + 0x10000 : + // This function should not be called when the condition is + // false, but we provide a sensible default in case it is. + static_cast(first); +} + +// Converts a wide string to a narrow string in UTF-8 encoding. +// The wide string is assumed to have the following encoding: +// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) +// UTF-32 if sizeof(wchar_t) == 4 (on Linux) +// Parameter str points to a null-terminated wide string. +// Parameter num_chars may additionally limit the number +// of wchar_t characters processed. -1 is used when the entire string +// should be processed. +// If the string contains code points that are not valid Unicode code points +// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output +// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding +// and contains invalid UTF-16 surrogate pairs, values in those pairs +// will be encoded as individual Unicode characters from Basic Normal Plane. +std::string WideStringToUtf8(const wchar_t* str, int num_chars) { + if (num_chars == -1) + num_chars = static_cast(wcslen(str)); + + ::std::stringstream stream; + for (int i = 0; i < num_chars; ++i) { + UInt32 unicode_code_point; + + if (str[i] == L'\0') { + break; + } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) { + unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i], + str[i + 1]); + i++; + } else { + unicode_code_point = static_cast(str[i]); + } + + stream << CodePointToUtf8(unicode_code_point); + } + return StringStreamToString(&stream); +} + +// Converts a wide C string to an std::string using the UTF-8 encoding. +// NULL will be converted to "(null)". 
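// ---------------------------------------------------------------------------
// [Editorial sketch -- illustrative only, not part of this patch.]
// Worked example of the surrogate-pair and UTF-8 logic above: the UTF-16
// pair (0xD83D, 0xDE00) combines to code point U+1F600, which encodes to the
// four UTF-8 bytes F0 9F 98 80.  The check is self-contained and does not
// call the gtest-internal helpers; the constants are ordinary Unicode facts.
#include <cassert>
#include <cstdint>

int main() {
  const uint16_t high_surrogate = 0xD83D, low_surrogate = 0xDE00;
  const uint32_t mask = (1u << 10) - 1;
  // Same arithmetic as CreateCodePointFromUtf16SurrogatePair() above.
  const uint32_t cp =
      (((high_surrogate & mask) << 10) | (low_surrogate & mask)) + 0x10000;
  assert(cp == 0x1F600);

  // 17-21 bit code points use the 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx layout.
  const unsigned char utf8[4] = {
      static_cast<unsigned char>(0xF0 | (cp >> 18)),
      static_cast<unsigned char>(0x80 | ((cp >> 12) & 0x3F)),
      static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3F)),
      static_cast<unsigned char>(0x80 | (cp & 0x3F)),
  };
  assert(utf8[0] == 0xF0 && utf8[1] == 0x9F);
  assert(utf8[2] == 0x98 && utf8[3] == 0x80);
  return 0;
}
// ---------------------------------------------------------------------------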
+std::string String::ShowWideCString(const wchar_t * wide_c_str) { + if (wide_c_str == NULL) return "(null)"; + + return internal::WideStringToUtf8(wide_c_str, -1); +} + +// Compares two wide C strings. Returns true iff they have the same +// content. +// +// Unlike wcscmp(), this function can handle NULL argument(s). A NULL +// C string is considered different to any non-NULL C string, +// including the empty string. +bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) { + if (lhs == NULL) return rhs == NULL; + + if (rhs == NULL) return false; + + return wcscmp(lhs, rhs) == 0; +} + +// Helper function for *_STREQ on wide strings. +AssertionResult CmpHelperSTREQ(const char* lhs_expression, + const char* rhs_expression, + const wchar_t* lhs, + const wchar_t* rhs) { + if (String::WideCStringEquals(lhs, rhs)) { + return AssertionSuccess(); + } + + return EqFailure(lhs_expression, + rhs_expression, + PrintToString(lhs), + PrintToString(rhs), + false); +} + +// Helper function for *_STRNE on wide strings. +AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, + const wchar_t* s1, + const wchar_t* s2) { + if (!String::WideCStringEquals(s1, s2)) { + return AssertionSuccess(); + } + + return AssertionFailure() << "Expected: (" << s1_expression << ") != (" + << s2_expression << "), actual: " + << PrintToString(s1) + << " vs " << PrintToString(s2); +} + +// Compares two C strings, ignoring case. Returns true iff they have +// the same content. +// +// Unlike strcasecmp(), this function can handle NULL argument(s). A +// NULL C string is considered different to any non-NULL C string, +// including the empty string. +bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) { + if (lhs == NULL) + return rhs == NULL; + if (rhs == NULL) + return false; + return posix::StrCaseCmp(lhs, rhs) == 0; +} + + // Compares two wide C strings, ignoring case. Returns true iff they + // have the same content. + // + // Unlike wcscasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL wide C string, + // including the empty string. + // NB: The implementations on different platforms slightly differ. + // On windows, this method uses _wcsicmp which compares according to LC_CTYPE + // environment variable. On GNU platform this method uses wcscasecmp + // which compares according to LC_CTYPE category of the current locale. + // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the + // current locale. +bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs, + const wchar_t* rhs) { + if (lhs == NULL) return rhs == NULL; + + if (rhs == NULL) return false; + +#if GTEST_OS_WINDOWS + return _wcsicmp(lhs, rhs) == 0; +#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID + return wcscasecmp(lhs, rhs) == 0; +#else + // Android, Mac OS X and Cygwin don't define wcscasecmp. + // Other unknown OSes may not define it either. + wint_t left, right; + do { + left = towlower(*lhs++); + right = towlower(*rhs++); + } while (left && left == right); + return left == right; +#endif // OS selector +} + +// Returns true iff str ends with the given suffix, ignoring case. +// Any string is considered to end with an empty suffix. 
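// ---------------------------------------------------------------------------
// [Editorial sketch -- illustrative only, not part of this patch.]
// The portable fallback above walks both wide strings with towlower() until
// the characters differ or one string ends.  The same pattern for narrow C
// strings is shown below; the cast through unsigned char matters because
// passing a negative plain char to tolower() is undefined behaviour.
#include <cctype>
#include <cstddef>

static bool CaseInsensitiveEqualsSketch(const char* lhs, const char* rhs) {
  if (lhs == NULL || rhs == NULL) return lhs == rhs;  // NULL equals NULL only
  int left, right;
  do {
    left = std::tolower(static_cast<unsigned char>(*lhs++));
    right = std::tolower(static_cast<unsigned char>(*rhs++));
  } while (left != '\0' && left == right);
  return left == right;
}
// Usage: CaseInsensitiveEqualsSketch("Duck", "dUCK") is true;
//        CaseInsensitiveEqualsSketch("Duck", "Ducks") is false.
// ---------------------------------------------------------------------------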
+bool String::EndsWithCaseInsensitive( + const std::string& str, const std::string& suffix) { + const size_t str_len = str.length(); + const size_t suffix_len = suffix.length(); + return (str_len >= suffix_len) && + CaseInsensitiveCStringEquals(str.c_str() + str_len - suffix_len, + suffix.c_str()); +} + +// Formats an int value as "%02d". +std::string String::FormatIntWidth2(int value) { + std::stringstream ss; + ss << std::setfill('0') << std::setw(2) << value; + return ss.str(); +} + +// Formats an int value as "%X". +std::string String::FormatHexInt(int value) { + std::stringstream ss; + ss << std::hex << std::uppercase << value; + return ss.str(); +} + +// Formats a byte as "%02X". +std::string String::FormatByte(unsigned char value) { + std::stringstream ss; + ss << std::setfill('0') << std::setw(2) << std::hex << std::uppercase + << static_cast(value); + return ss.str(); +} + +// Converts the buffer in a stringstream to an std::string, converting NUL +// bytes to "\\0" along the way. +std::string StringStreamToString(::std::stringstream* ss) { + const ::std::string& str = ss->str(); + const char* const start = str.c_str(); + const char* const end = start + str.length(); + + std::string result; + result.reserve(2 * (end - start)); + for (const char* ch = start; ch != end; ++ch) { + if (*ch == '\0') { + result += "\\0"; // Replaces NUL with "\\0"; + } else { + result += *ch; + } + } + + return result; +} + +// Appends the user-supplied message to the Google-Test-generated message. +std::string AppendUserMessage(const std::string& gtest_msg, + const Message& user_msg) { + // Appends the user message if it's non-empty. + const std::string user_msg_string = user_msg.GetString(); + if (user_msg_string.empty()) { + return gtest_msg; + } + + return gtest_msg + "\n" + user_msg_string; +} + +} // namespace internal + +// class TestResult + +// Creates an empty TestResult. +TestResult::TestResult() + : death_test_count_(0), + elapsed_time_(0) { +} + +// D'tor. +TestResult::~TestResult() { +} + +// Returns the i-th test part result among all the results. i can +// range from 0 to total_part_count() - 1. If i is not in that range, +// aborts the program. +const TestPartResult& TestResult::GetTestPartResult(int i) const { + if (i < 0 || i >= total_part_count()) + internal::posix::Abort(); + return test_part_results_.at(i); +} + +// Returns the i-th test property. i can range from 0 to +// test_property_count() - 1. If i is not in that range, aborts the +// program. +const TestProperty& TestResult::GetTestProperty(int i) const { + if (i < 0 || i >= test_property_count()) + internal::posix::Abort(); + return test_properties_.at(i); +} + +// Clears the test part results. +void TestResult::ClearTestPartResults() { + test_part_results_.clear(); +} + +// Adds a test part result to the list. +void TestResult::AddTestPartResult(const TestPartResult& test_part_result) { + test_part_results_.push_back(test_part_result); +} + +// Adds a test property to the list. If a property with the same key as the +// supplied property is already represented, the value of this test_property +// replaces the old value for that key. 
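// ---------------------------------------------------------------------------
// [Editorial sketch -- illustrative only, not part of this patch.]
// The String::Format*() helpers above rebuild the printf conversions "%02d",
// "%X" and "%02X" out of stream manipulators.  A self-contained check of the
// equivalences (values chosen arbitrarily):
#include <cassert>
#include <iomanip>
#include <sstream>

int main() {
  std::stringstream two_digits, hex_int, byte;
  two_digits << std::setfill('0') << std::setw(2) << 7;              // "%02d"
  hex_int << std::hex << std::uppercase << 3914;                     // "%X"
  byte << std::setfill('0') << std::setw(2) << std::hex
       << std::uppercase << static_cast<unsigned int>(0x0A);         // "%02X"
  assert(two_digits.str() == "07");
  assert(hex_int.str() == "F4A");
  assert(byte.str() == "0A");
  return 0;
}
// Note: std::setw() only applies to the next value inserted, which is why the
// helpers above can chain all the manipulators before the single value.
// ---------------------------------------------------------------------------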
+void TestResult::RecordProperty(const std::string& xml_element, + const TestProperty& test_property) { + if (!ValidateTestProperty(xml_element, test_property)) { + return; + } + internal::MutexLock lock(&test_properites_mutex_); + const std::vector::iterator property_with_matching_key = + std::find_if(test_properties_.begin(), test_properties_.end(), + internal::TestPropertyKeyIs(test_property.key())); + if (property_with_matching_key == test_properties_.end()) { + test_properties_.push_back(test_property); + return; + } + property_with_matching_key->SetValue(test_property.value()); +} + +// The list of reserved attributes used in the element of XML +// output. +static const char* const kReservedTestSuitesAttributes[] = { + "disabled", + "errors", + "failures", + "name", + "random_seed", + "tests", + "time", + "timestamp" +}; + +// The list of reserved attributes used in the element of XML +// output. +static const char* const kReservedTestSuiteAttributes[] = { + "disabled", + "errors", + "failures", + "name", + "tests", + "time" +}; + +// The list of reserved attributes used in the element of XML output. +static const char* const kReservedTestCaseAttributes[] = { + "classname", + "name", + "status", + "time", + "type_param", + "value_param" +}; + +template +std::vector ArrayAsVector(const char* const (&array)[kSize]) { + return std::vector(array, array + kSize); +} + +static std::vector GetReservedAttributesForElement( + const std::string& xml_element) { + if (xml_element == "testsuites") { + return ArrayAsVector(kReservedTestSuitesAttributes); + } else if (xml_element == "testsuite") { + return ArrayAsVector(kReservedTestSuiteAttributes); + } else if (xml_element == "testcase") { + return ArrayAsVector(kReservedTestCaseAttributes); + } else { + GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element; + } + // This code is unreachable but some compilers may not realizes that. + return std::vector(); +} + +static std::string FormatWordList(const std::vector& words) { + Message word_list; + for (size_t i = 0; i < words.size(); ++i) { + if (i > 0 && words.size() > 2) { + word_list << ", "; + } + if (i == words.size() - 1) { + word_list << "and "; + } + word_list << "'" << words[i] << "'"; + } + return word_list.GetString(); +} + +bool ValidateTestPropertyName(const std::string& property_name, + const std::vector& reserved_names) { + if (std::find(reserved_names.begin(), reserved_names.end(), property_name) != + reserved_names.end()) { + ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name + << " (" << FormatWordList(reserved_names) + << " are reserved by " << GTEST_NAME_ << ")"; + return false; + } + return true; +} + +// Adds a failure if the key is a reserved attribute of the element named +// xml_element. Returns true if the property is valid. +bool TestResult::ValidateTestProperty(const std::string& xml_element, + const TestProperty& test_property) { + return ValidateTestPropertyName(test_property.key(), + GetReservedAttributesForElement(xml_element)); +} + +// Clears the object. +void TestResult::Clear() { + test_part_results_.clear(); + test_properties_.clear(); + death_test_count_ = 0; + elapsed_time_ = 0; +} + +// Returns true iff the test failed. +bool TestResult::Failed() const { + for (int i = 0; i < total_part_count(); ++i) { + if (GetTestPartResult(i).failed()) + return true; + } + return false; +} + +// Returns true iff the test part fatally failed. 
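// ---------------------------------------------------------------------------
// [Editorial sketch -- illustrative only, not part of this patch.]
// RecordProperty() above is an "update-or-append by key" over a flat vector:
// std::find_if locates an entry with a matching key, and the value is either
// overwritten or a new entry is pushed back.  The same pattern on a plain
// vector of key/value pairs (names here are invented for the example):
#include <algorithm>
#include <string>
#include <utility>
#include <vector>

typedef std::pair<std::string, std::string> Property;

static void SetProperty(std::vector<Property>* props,
                        const std::string& key, const std::string& value) {
  const std::vector<Property>::iterator match =
      std::find_if(props->begin(), props->end(),
                   [&key](const Property& p) { return p.first == key; });
  if (match == props->end()) {
    props->push_back(Property(key, value));  // no such key yet: append
  } else {
    match->second = value;                   // existing key: overwrite value
  }
}
// A std::map would behave the same way, but the vector keeps the properties
// in the order they were first recorded.
// ---------------------------------------------------------------------------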
+static bool TestPartFatallyFailed(const TestPartResult& result) { + return result.fatally_failed(); +} + +// Returns true iff the test fatally failed. +bool TestResult::HasFatalFailure() const { + return CountIf(test_part_results_, TestPartFatallyFailed) > 0; +} + +// Returns true iff the test part non-fatally failed. +static bool TestPartNonfatallyFailed(const TestPartResult& result) { + return result.nonfatally_failed(); +} + +// Returns true iff the test has a non-fatal failure. +bool TestResult::HasNonfatalFailure() const { + return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0; +} + +// Gets the number of all test parts. This is the sum of the number +// of successful test parts and the number of failed test parts. +int TestResult::total_part_count() const { + return static_cast(test_part_results_.size()); +} + +// Returns the number of the test properties. +int TestResult::test_property_count() const { + return static_cast(test_properties_.size()); +} + +// class Test + +// Creates a Test object. + +// The c'tor saves the states of all flags. +Test::Test() + : gtest_flag_saver_(new GTEST_FLAG_SAVER_) { +} + +// The d'tor restores the states of all flags. The actual work is +// done by the d'tor of the gtest_flag_saver_ field, and thus not +// visible here. +Test::~Test() { +} + +// Sets up the test fixture. +// +// A sub-class may override this. +void Test::SetUp() { +} + +// Tears down the test fixture. +// +// A sub-class may override this. +void Test::TearDown() { +} + +// Allows user supplied key value pairs to be recorded for later output. +void Test::RecordProperty(const std::string& key, const std::string& value) { + UnitTest::GetInstance()->RecordProperty(key, value); +} + +// Allows user supplied key value pairs to be recorded for later output. +void Test::RecordProperty(const std::string& key, int value) { + Message value_message; + value_message << value; + RecordProperty(key, value_message.GetString().c_str()); +} + +namespace internal { + +void ReportFailureInUnknownLocation(TestPartResult::Type result_type, + const std::string& message) { + // This function is a friend of UnitTest and as such has access to + // AddTestPartResult. + UnitTest::GetInstance()->AddTestPartResult( + result_type, + NULL, // No info about the source file where the exception occurred. + -1, // We have no info on which line caused the exception. + message, + ""); // No stack trace, either. +} + +} // namespace internal + +// Google Test requires all tests in the same test case to use the same test +// fixture class. This function checks if the current test has the +// same fixture class as the first test in the current test case. If +// yes, it returns true; otherwise it generates a Google Test failure and +// returns false. +bool Test::HasSameFixtureClass() { + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + const TestCase* const test_case = impl->current_test_case(); + + // Info about the first test in the current test case. + const TestInfo* const first_test_info = test_case->test_info_list()[0]; + const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_; + const char* const first_test_name = first_test_info->name(); + + // Info about the current test. 
+ const TestInfo* const this_test_info = impl->current_test_info(); + const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_; + const char* const this_test_name = this_test_info->name(); + + if (this_fixture_id != first_fixture_id) { + // Is the first test defined using TEST? + const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId(); + // Is this test defined using TEST? + const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId(); + + if (first_is_TEST || this_is_TEST) { + // Both TEST and TEST_F appear in same test case, which is incorrect. + // Tell the user how to fix this. + + // Gets the name of the TEST and the name of the TEST_F. Note + // that first_is_TEST and this_is_TEST cannot both be true, as + // the fixture IDs are different for the two tests. + const char* const TEST_name = + first_is_TEST ? first_test_name : this_test_name; + const char* const TEST_F_name = + first_is_TEST ? this_test_name : first_test_name; + + ADD_FAILURE() + << "All tests in the same test case must use the same test fixture\n" + << "class, so mixing TEST_F and TEST in the same test case is\n" + << "illegal. In test case " << this_test_info->test_case_name() + << ",\n" + << "test " << TEST_F_name << " is defined using TEST_F but\n" + << "test " << TEST_name << " is defined using TEST. You probably\n" + << "want to change the TEST to TEST_F or move it to another test\n" + << "case."; + } else { + // Two fixture classes with the same name appear in two different + // namespaces, which is not allowed. Tell the user how to fix this. + ADD_FAILURE() + << "All tests in the same test case must use the same test fixture\n" + << "class. However, in test case " + << this_test_info->test_case_name() << ",\n" + << "you defined test " << first_test_name + << " and test " << this_test_name << "\n" + << "using two different test fixture classes. This can happen if\n" + << "the two classes are from different namespaces or translation\n" + << "units and have the same name. You should probably rename one\n" + << "of the classes to put the tests into different test cases."; + } + return false; + } + + return true; +} + +#if GTEST_HAS_SEH + +// Adds an "exception thrown" fatal failure to the current test. This +// function returns its result via an output parameter pointer because VC++ +// prohibits creation of objects with destructors on stack in functions +// using __try (see error C2712). +static std::string* FormatSehExceptionMessage(DWORD exception_code, + const char* location) { + Message message; + message << "SEH exception with code 0x" << std::setbase(16) << + exception_code << std::setbase(10) << " thrown in " << location << "."; + + return new std::string(message.GetString()); +} + +#endif // GTEST_HAS_SEH + +namespace internal { + +#if GTEST_HAS_EXCEPTIONS + +// Adds an "exception thrown" fatal failure to the current test. 
+static std::string FormatCxxExceptionMessage(const char* description, + const char* location) { + Message message; + if (description != NULL) { + message << "C++ exception with description \"" << description << "\""; + } else { + message << "Unknown C++ exception"; + } + message << " thrown in " << location << "."; + + return message.GetString(); +} + +static std::string PrintTestPartResultToString( + const TestPartResult& test_part_result); + +GoogleTestFailureException::GoogleTestFailureException( + const TestPartResult& failure) + : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {} + +#endif // GTEST_HAS_EXCEPTIONS + +// We put these helper functions in the internal namespace as IBM's xlC +// compiler rejects the code if they were declared static. + +// Runs the given method and handles SEH exceptions it throws, when +// SEH is supported; returns the 0-value for type Result in case of an +// SEH exception. (Microsoft compilers cannot handle SEH and C++ +// exceptions in the same function. Therefore, we provide a separate +// wrapper function for handling SEH exceptions.) +template +Result HandleSehExceptionsInMethodIfSupported( + T* object, Result (T::*method)(), const char* location) { +#if GTEST_HAS_SEH + __try { + return (object->*method)(); + } __except (internal::UnitTestOptions::GTestShouldProcessSEH( // NOLINT + GetExceptionCode())) { + // We create the exception message on the heap because VC++ prohibits + // creation of objects with destructors on stack in functions using __try + // (see error C2712). + std::string* exception_message = FormatSehExceptionMessage( + GetExceptionCode(), location); + internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure, + *exception_message); + delete exception_message; + return static_cast(0); + } +#else + (void)location; + return (object->*method)(); +#endif // GTEST_HAS_SEH +} + +// Runs the given method and catches and reports C++ and/or SEH-style +// exceptions, if they are supported; returns the 0-value for type +// Result in case of an SEH exception. +template +Result HandleExceptionsInMethodIfSupported( + T* object, Result (T::*method)(), const char* location) { + // NOTE: The user code can affect the way in which Google Test handles + // exceptions by setting GTEST_FLAG(catch_exceptions), but only before + // RUN_ALL_TESTS() starts. It is technically possible to check the flag + // after the exception is caught and either report or re-throw the + // exception based on the flag's value: + // + // try { + // // Perform the test method. + // } catch (...) { + // if (GTEST_FLAG(catch_exceptions)) + // // Report the exception as failure. + // else + // throw; // Re-throws the original exception. + // } + // + // However, the purpose of this flag is to allow the program to drop into + // the debugger when the exception is thrown. On most platforms, once the + // control enters the catch block, the exception origin information is + // lost and the debugger will stop the program at the point of the + // re-throw in this function -- instead of at the point of the original + // throw statement in the code under test. For this reason, we perform + // the check early, sacrificing the ability to affect Google Test's + // exception handling in the method where the exception is thrown. 
+ if (internal::GetUnitTestImpl()->catch_exceptions()) { +#if GTEST_HAS_EXCEPTIONS + try { + return HandleSehExceptionsInMethodIfSupported(object, method, location); + } catch (const internal::GoogleTestFailureException&) { // NOLINT + // This exception type can only be thrown by a failed Google + // Test assertion with the intention of letting another testing + // framework catch it. Therefore we just re-throw it. + throw; + } catch (const std::exception& e) { // NOLINT + internal::ReportFailureInUnknownLocation( + TestPartResult::kFatalFailure, + FormatCxxExceptionMessage(e.what(), location)); + } catch (...) { // NOLINT + internal::ReportFailureInUnknownLocation( + TestPartResult::kFatalFailure, + FormatCxxExceptionMessage(NULL, location)); + } + return static_cast(0); +#else + return HandleSehExceptionsInMethodIfSupported(object, method, location); +#endif // GTEST_HAS_EXCEPTIONS + } else { + return (object->*method)(); + } +} + +} // namespace internal + +// Runs the test and updates the test result. +void Test::Run() { + if (!HasSameFixtureClass()) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()"); + // We will run the test only if SetUp() was successful. + if (!HasFatalFailure()) { + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &Test::TestBody, "the test body"); + } + + // However, we want to clean up as much as possible. Hence we will + // always call TearDown(), even if SetUp() or the test body has + // failed. + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &Test::TearDown, "TearDown()"); +} + +// Returns true iff the current test has a fatal failure. +bool Test::HasFatalFailure() { + return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure(); +} + +// Returns true iff the current test has a non-fatal failure. +bool Test::HasNonfatalFailure() { + return internal::GetUnitTestImpl()->current_test_result()-> + HasNonfatalFailure(); +} + +// class TestInfo + +// Constructs a TestInfo object. It assumes ownership of the test factory +// object. +TestInfo::TestInfo(const std::string& a_test_case_name, + const std::string& a_name, + const char* a_type_param, + const char* a_value_param, + internal::CodeLocation a_code_location, + internal::TypeId fixture_class_id, + internal::TestFactoryBase* factory) + : test_case_name_(a_test_case_name), + name_(a_name), + type_param_(a_type_param ? new std::string(a_type_param) : NULL), + value_param_(a_value_param ? new std::string(a_value_param) : NULL), + location_(a_code_location), + fixture_class_id_(fixture_class_id), + should_run_(false), + is_disabled_(false), + matches_filter_(false), + factory_(factory), + result_() {} + +// Destructs a TestInfo object. +TestInfo::~TestInfo() { delete factory_; } + +namespace internal { + +// Creates a new TestInfo object and registers it with Google Test; +// returns the created object. +// +// Arguments: +// +// test_case_name: name of the test case +// name: name of the test +// type_param: the name of the test's type parameter, or NULL if +// this is not a typed or a type-parameterized test. +// value_param: text representation of the test's value parameter, +// or NULL if this is not a value-parameterized test. 
+// code_location: code location where the test is defined +// fixture_class_id: ID of the test fixture class +// set_up_tc: pointer to the function that sets up the test case +// tear_down_tc: pointer to the function that tears down the test case +// factory: pointer to the factory that creates a test object. +// The newly created TestInfo instance will assume +// ownership of the factory object. +TestInfo* MakeAndRegisterTestInfo( + const char* test_case_name, + const char* name, + const char* type_param, + const char* value_param, + CodeLocation code_location, + TypeId fixture_class_id, + SetUpTestCaseFunc set_up_tc, + TearDownTestCaseFunc tear_down_tc, + TestFactoryBase* factory) { + TestInfo* const test_info = + new TestInfo(test_case_name, name, type_param, value_param, + code_location, fixture_class_id, factory); + GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info); + return test_info; +} + +#if GTEST_HAS_PARAM_TEST +void ReportInvalidTestCaseType(const char* test_case_name, + CodeLocation code_location) { + Message errors; + errors + << "Attempted redefinition of test case " << test_case_name << ".\n" + << "All tests in the same test case must use the same test fixture\n" + << "class. However, in test case " << test_case_name << ", you tried\n" + << "to define a test using a fixture class different from the one\n" + << "used earlier. This can happen if the two fixture classes are\n" + << "from different namespaces and have the same name. You should\n" + << "probably rename one of the classes to put the tests into different\n" + << "test cases."; + + fprintf(stderr, "%s %s", + FormatFileLocation(code_location.file.c_str(), + code_location.line).c_str(), + errors.GetString().c_str()); +} +#endif // GTEST_HAS_PARAM_TEST + +} // namespace internal + +namespace { + +// A predicate that checks the test name of a TestInfo against a known +// value. +// +// This is used for implementation of the TestCase class only. We put +// it in the anonymous namespace to prevent polluting the outer +// namespace. +// +// TestNameIs is copyable. +class TestNameIs { + public: + // Constructor. + // + // TestNameIs has NO default constructor. + explicit TestNameIs(const char* name) + : name_(name) {} + + // Returns true iff the test name of test_info matches name_. + bool operator()(const TestInfo * test_info) const { + return test_info && test_info->name() == name_; + } + + private: + std::string name_; +}; + +} // namespace + +namespace internal { + +// This method expands all parameterized tests registered with macros TEST_P +// and INSTANTIATE_TEST_CASE_P into regular tests and registers those. +// This will be done just once during the program runtime. +void UnitTestImpl::RegisterParameterizedTests() { +#if GTEST_HAS_PARAM_TEST + if (!parameterized_tests_registered_) { + parameterized_test_registry_.RegisterTests(); + parameterized_tests_registered_ = true; + } +#endif +} + +} // namespace internal + +// Creates the test object, runs it, records its result, and then +// deletes it. +void TestInfo::Run() { + if (!should_run_) return; + + // Tells UnitTest where to store test result. + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_info(this); + + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + + // Notifies the unit test event listeners that a test is about to start. 
+ repeater->OnTestStart(*this); + + const TimeInMillis start = internal::GetTimeInMillis(); + + impl->os_stack_trace_getter()->UponLeavingGTest(); + + // Creates the test object. + Test* const test = internal::HandleExceptionsInMethodIfSupported( + factory_, &internal::TestFactoryBase::CreateTest, + "the test fixture's constructor"); + + // Runs the test only if the test object was created and its + // constructor didn't generate a fatal failure. + if ((test != NULL) && !Test::HasFatalFailure()) { + // This doesn't throw as all user code that can throw are wrapped into + // exception handling code. + test->Run(); + } + + // Deletes the test object. + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + test, &Test::DeleteSelf_, "the test fixture's destructor"); + + result_.set_elapsed_time(internal::GetTimeInMillis() - start); + + // Notifies the unit test event listener that a test has just finished. + repeater->OnTestEnd(*this); + + // Tells UnitTest to stop associating assertion results to this + // test. + impl->set_current_test_info(NULL); +} + +// class TestCase + +// Gets the number of successful tests in this test case. +int TestCase::successful_test_count() const { + return CountIf(test_info_list_, TestPassed); +} + +// Gets the number of failed tests in this test case. +int TestCase::failed_test_count() const { + return CountIf(test_info_list_, TestFailed); +} + +// Gets the number of disabled tests that will be reported in the XML report. +int TestCase::reportable_disabled_test_count() const { + return CountIf(test_info_list_, TestReportableDisabled); +} + +// Gets the number of disabled tests in this test case. +int TestCase::disabled_test_count() const { + return CountIf(test_info_list_, TestDisabled); +} + +// Gets the number of tests to be printed in the XML report. +int TestCase::reportable_test_count() const { + return CountIf(test_info_list_, TestReportable); +} + +// Get the number of tests in this test case that should run. +int TestCase::test_to_run_count() const { + return CountIf(test_info_list_, ShouldRunTest); +} + +// Gets the number of all tests. +int TestCase::total_test_count() const { + return static_cast(test_info_list_.size()); +} + +// Creates a TestCase with the given name. +// +// Arguments: +// +// name: name of the test case +// a_type_param: the name of the test case's type parameter, or NULL if +// this is not a typed or a type-parameterized test case. +// set_up_tc: pointer to the function that sets up the test case +// tear_down_tc: pointer to the function that tears down the test case +TestCase::TestCase(const char* a_name, const char* a_type_param, + Test::SetUpTestCaseFunc set_up_tc, + Test::TearDownTestCaseFunc tear_down_tc) + : name_(a_name), + type_param_(a_type_param ? new std::string(a_type_param) : NULL), + set_up_tc_(set_up_tc), + tear_down_tc_(tear_down_tc), + should_run_(false), + elapsed_time_(0) { +} + +// Destructor of TestCase. +TestCase::~TestCase() { + // Deletes every Test in the collection. + ForEach(test_info_list_, internal::Delete); +} + +// Returns the i-th test among all the tests. i can range from 0 to +// total_test_count() - 1. If i is not in that range, returns NULL. +const TestInfo* TestCase::GetTestInfo(int i) const { + const int index = GetElementOr(test_indices_, i, -1); + return index < 0 ? NULL : test_info_list_[index]; +} + +// Returns the i-th test among all the tests. i can range from 0 to +// total_test_count() - 1. If i is not in that range, returns NULL. 
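// ---------------------------------------------------------------------------
// [Editorial sketch -- illustrative only, not part of this patch.]
// Every *_test_count() accessor above is the same one-liner: CountIf() over
// test_info_list_ with a different predicate.  The standard-library shape of
// that pattern, with invented stand-in types:
#include <algorithm>
#include <vector>

struct FakeResult { bool passed; };

static bool Passed(const FakeResult& r) { return r.passed; }

static int PassedCount(const std::vector<FakeResult>& results) {
  return static_cast<int>(
      std::count_if(results.begin(), results.end(), Passed));
}
// With results = { {true}, {false}, {true} }, PassedCount(results) == 2.
// ---------------------------------------------------------------------------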
+TestInfo* TestCase::GetMutableTestInfo(int i) { + const int index = GetElementOr(test_indices_, i, -1); + return index < 0 ? NULL : test_info_list_[index]; +} + +// Adds a test to this test case. Will delete the test upon +// destruction of the TestCase object. +void TestCase::AddTestInfo(TestInfo * test_info) { + test_info_list_.push_back(test_info); + test_indices_.push_back(static_cast(test_indices_.size())); +} + +// Runs every test in this TestCase. +void TestCase::Run() { + if (!should_run_) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_case(this); + + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + + repeater->OnTestCaseStart(*this); + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &TestCase::RunSetUpTestCase, "SetUpTestCase()"); + + const internal::TimeInMillis start = internal::GetTimeInMillis(); + for (int i = 0; i < total_test_count(); i++) { + GetMutableTestInfo(i)->Run(); + } + elapsed_time_ = internal::GetTimeInMillis() - start; + + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &TestCase::RunTearDownTestCase, "TearDownTestCase()"); + + repeater->OnTestCaseEnd(*this); + impl->set_current_test_case(NULL); +} + +// Clears the results of all tests in this test case. +void TestCase::ClearResult() { + ad_hoc_test_result_.Clear(); + ForEach(test_info_list_, TestInfo::ClearTestResult); +} + +// Shuffles the tests in this test case. +void TestCase::ShuffleTests(internal::Random* random) { + Shuffle(random, &test_indices_); +} + +// Restores the test order to before the first shuffle. +void TestCase::UnshuffleTests() { + for (size_t i = 0; i < test_indices_.size(); i++) { + test_indices_[i] = static_cast(i); + } +} + +// Formats a countable noun. Depending on its quantity, either the +// singular form or the plural form is used. e.g. +// +// FormatCountableNoun(1, "formula", "formuli") returns "1 formula". +// FormatCountableNoun(5, "book", "books") returns "5 books". +static std::string FormatCountableNoun(int count, + const char * singular_form, + const char * plural_form) { + return internal::StreamableToString(count) + " " + + (count == 1 ? singular_form : plural_form); +} + +// Formats the count of tests. +static std::string FormatTestCount(int test_count) { + return FormatCountableNoun(test_count, "test", "tests"); +} + +// Formats the count of test cases. +static std::string FormatTestCaseCount(int test_case_count) { + return FormatCountableNoun(test_case_count, "test case", "test cases"); +} + +// Converts a TestPartResult::Type enum to human-friendly string +// representation. Both kNonFatalFailure and kFatalFailure are translated +// to "Failure", as the user usually doesn't care about the difference +// between the two when viewing the test result. +static const char * TestPartResultTypeToString(TestPartResult::Type type) { + switch (type) { + case TestPartResult::kSuccess: + return "Success"; + + case TestPartResult::kNonFatalFailure: + case TestPartResult::kFatalFailure: +#ifdef _MSC_VER + return "error: "; +#else + return "Failure\n"; +#endif + default: + return "Unknown result type"; + } +} + +namespace internal { + +// Prints a TestPartResult to an std::string. 
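// ---------------------------------------------------------------------------
// [Editorial sketch -- illustrative only, not part of this patch.]
// ShuffleTests()/UnshuffleTests() above never reorder the TestInfo objects;
// they shuffle a parallel vector of indices (test_indices_) and route every
// lookup through it, so the original order can be restored exactly.  A
// stand-alone sketch of that indirection, with std::shuffle standing in for
// internal::Random:
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <random>
#include <string>
#include <vector>

class ShuffledView {
 public:
  explicit ShuffledView(const std::vector<std::string>& items)
      : items_(items), indices_(items.size()) {
    std::iota(indices_.begin(), indices_.end(), 0);  // 0, 1, 2, ...
  }
  void Shuffle(unsigned seed) {
    std::mt19937 rng(seed);
    std::shuffle(indices_.begin(), indices_.end(), rng);
  }
  void Unshuffle() { std::iota(indices_.begin(), indices_.end(), 0); }
  const std::string& Get(std::size_t i) const { return items_[indices_[i]]; }

 private:
  const std::vector<std::string>& items_;  // storage order never changes
  std::vector<std::size_t> indices_;       // only the viewing order changes
};
// The referenced 'items' vector must outlive the view, as with any reference
// member; the code above instead owns its test_info_list_ directly.
// ---------------------------------------------------------------------------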
+static std::string PrintTestPartResultToString( + const TestPartResult& test_part_result) { + return (Message() + << internal::FormatFileLocation(test_part_result.file_name(), + test_part_result.line_number()) + << " " << TestPartResultTypeToString(test_part_result.type()) + << test_part_result.message()).GetString(); +} + +// Prints a TestPartResult. +static void PrintTestPartResult(const TestPartResult& test_part_result) { + const std::string& result = + PrintTestPartResultToString(test_part_result); + printf("%s\n", result.c_str()); + fflush(stdout); + // If the test program runs in Visual Studio or a debugger, the + // following statements add the test part result message to the Output + // window such that the user can double-click on it to jump to the + // corresponding source code location; otherwise they do nothing. +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + // We don't call OutputDebugString*() on Windows Mobile, as printing + // to stdout is done by OutputDebugString() there already - we don't + // want the same message printed twice. + ::OutputDebugStringA(result.c_str()); + ::OutputDebugStringA("\n"); +#endif +} + +// class PrettyUnitTestResultPrinter + +enum GTestColor { + COLOR_DEFAULT, + COLOR_RED, + COLOR_GREEN, + COLOR_YELLOW +}; + +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \ + !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + +// Returns the character attribute for the given color. +WORD GetColorAttribute(GTestColor color) { + switch (color) { + case COLOR_RED: return FOREGROUND_RED; + case COLOR_GREEN: return FOREGROUND_GREEN; + case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN; + default: return 0; + } +} + +#else + +// Returns the ANSI color code for the given color. COLOR_DEFAULT is +// an invalid input. +const char* GetAnsiColorCode(GTestColor color) { + switch (color) { + case COLOR_RED: return "1"; + case COLOR_GREEN: return "2"; + case COLOR_YELLOW: return "3"; + default: return NULL; + }; +} + +#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + +// Returns true iff Google Test should use colors in the output. +bool ShouldUseColor(bool stdout_is_tty) { + const char* const gtest_color = GTEST_FLAG(color).c_str(); + + if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) { +#if GTEST_OS_WINDOWS + // On Windows the TERM variable is usually not set, but the + // console there does support colors. + return stdout_is_tty; +#else + // On non-Windows platforms, we rely on the TERM variable. + const char* const term = posix::GetEnv("TERM"); + const bool term_supports_color = + String::CStringEquals(term, "xterm") || + String::CStringEquals(term, "xterm-color") || + String::CStringEquals(term, "xterm-256color") || + String::CStringEquals(term, "screen") || + String::CStringEquals(term, "screen-256color") || + String::CStringEquals(term, "tmux") || + String::CStringEquals(term, "tmux-256color") || + String::CStringEquals(term, "rxvt-unicode") || + String::CStringEquals(term, "rxvt-unicode-256color") || + String::CStringEquals(term, "linux") || + String::CStringEquals(term, "cygwin"); + return stdout_is_tty && term_supports_color; +#endif // GTEST_OS_WINDOWS + } + + return String::CaseInsensitiveCStringEquals(gtest_color, "yes") || + String::CaseInsensitiveCStringEquals(gtest_color, "true") || + String::CaseInsensitiveCStringEquals(gtest_color, "t") || + String::CStringEquals(gtest_color, "1"); + // We take "yes", "true", "t", and "1" as meaning "yes". 
If the + // value is neither one of these nor "auto", we treat it as "no" to + // be conservative. +} + +// Helpers for printing colored strings to stdout. Note that on Windows, we +// cannot simply emit special characters and have the terminal change colors. +// This routine must actually emit the characters rather than return a string +// that would be colored when printed, as can be done on Linux. +void ColoredPrintf(GTestColor color, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS || \ + GTEST_OS_IOS || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT + const bool use_color = AlwaysFalse(); +#else + static const bool in_color_mode = + ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0); + const bool use_color = in_color_mode && (color != COLOR_DEFAULT); +#endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS + // The '!= 0' comparison is necessary to satisfy MSVC 7.1. + + if (!use_color) { + vprintf(fmt, args); + va_end(args); + return; + } + +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \ + !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); + + // Gets the current text color. + CONSOLE_SCREEN_BUFFER_INFO buffer_info; + GetConsoleScreenBufferInfo(stdout_handle, &buffer_info); + const WORD old_color_attrs = buffer_info.wAttributes; + + // We need to flush the stream buffers into the console before each + // SetConsoleTextAttribute call lest it affect the text that is already + // printed but has not yet reached the console. + fflush(stdout); + SetConsoleTextAttribute(stdout_handle, + GetColorAttribute(color) | FOREGROUND_INTENSITY); + vprintf(fmt, args); + + fflush(stdout); + // Restores the text color. + SetConsoleTextAttribute(stdout_handle, old_color_attrs); +#else + printf("\033[0;3%sm", GetAnsiColorCode(color)); + vprintf(fmt, args); + printf("\033[m"); // Resets the terminal to default. +#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + va_end(args); +} + +// Text printed in Google Test's text output and --gunit_list_tests +// output to label the type parameter and value parameter for a test. +static const char kTypeParamLabel[] = "TypeParam"; +static const char kValueParamLabel[] = "GetParam()"; + +void PrintFullTestCommentIfPresent(const TestInfo& test_info) { + const char* const type_param = test_info.type_param(); + const char* const value_param = test_info.value_param(); + + if (type_param != NULL || value_param != NULL) { + printf(", where "); + if (type_param != NULL) { + printf("%s = %s", kTypeParamLabel, type_param); + if (value_param != NULL) + printf(" and "); + } + if (value_param != NULL) { + printf("%s = %s", kValueParamLabel, value_param); + } + } +} + +// This class implements the TestEventListener interface. +// +// Class PrettyUnitTestResultPrinter is copyable. +class PrettyUnitTestResultPrinter : public TestEventListener { + public: + PrettyUnitTestResultPrinter() {} + static void PrintTestName(const char * test_case, const char * test) { + printf("%s.%s", test_case, test); + } + + // The following methods override what's in the TestEventListener class. 
+ virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {} + virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration); + virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test); + virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {} + virtual void OnTestCaseStart(const TestCase& test_case); + virtual void OnTestStart(const TestInfo& test_info); + virtual void OnTestPartResult(const TestPartResult& result); + virtual void OnTestEnd(const TestInfo& test_info); + virtual void OnTestCaseEnd(const TestCase& test_case); + virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test); + virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {} + virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); + virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {} + + private: + static void PrintFailedTests(const UnitTest& unit_test); +}; + + // Fired before each iteration of tests starts. +void PrettyUnitTestResultPrinter::OnTestIterationStart( + const UnitTest& unit_test, int iteration) { + if (GTEST_FLAG(repeat) != 1) + printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1); + + const char* const filter = GTEST_FLAG(filter).c_str(); + + // Prints the filter if it's not *. This reminds the user that some + // tests may be skipped. + if (!String::CStringEquals(filter, kUniversalFilter)) { + ColoredPrintf(COLOR_YELLOW, + "Note: %s filter = %s\n", GTEST_NAME_, filter); + } + + if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) { + const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1); + ColoredPrintf(COLOR_YELLOW, + "Note: This is test shard %d of %s.\n", + static_cast(shard_index) + 1, + internal::posix::GetEnv(kTestTotalShards)); + } + + if (GTEST_FLAG(shuffle)) { + ColoredPrintf(COLOR_YELLOW, + "Note: Randomizing tests' orders with a seed of %d .\n", + unit_test.random_seed()); + } + + ColoredPrintf(COLOR_GREEN, "[==========] "); + printf("Running %s from %s.\n", + FormatTestCount(unit_test.test_to_run_count()).c_str(), + FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str()); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart( + const UnitTest& /*unit_test*/) { + ColoredPrintf(COLOR_GREEN, "[----------] "); + printf("Global test environment set-up.\n"); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) { + const std::string counts = + FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); + ColoredPrintf(COLOR_GREEN, "[----------] "); + printf("%s from %s", counts.c_str(), test_case.name()); + if (test_case.type_param() == NULL) { + printf("\n"); + } else { + printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param()); + } + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { + ColoredPrintf(COLOR_GREEN, "[ RUN ] "); + PrintTestName(test_info.test_case_name(), test_info.name()); + printf("\n"); + fflush(stdout); +} + +// Called after an assertion failure. +void PrettyUnitTestResultPrinter::OnTestPartResult( + const TestPartResult& result) { + // If the test part succeeded, we don't need to do anything. + if (result.type() == TestPartResult::kSuccess) + return; + + // Print failure message from the assertion (e.g. expected this and got that). 
+ PrintTestPartResult(result); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { + if (test_info.result()->Passed()) { + ColoredPrintf(COLOR_GREEN, "[ OK ] "); + } else { + ColoredPrintf(COLOR_RED, "[ FAILED ] "); + } + PrintTestName(test_info.test_case_name(), test_info.name()); + if (test_info.result()->Failed()) + PrintFullTestCommentIfPresent(test_info); + + if (GTEST_FLAG(print_time)) { + printf(" (%s ms)\n", internal::StreamableToString( + test_info.result()->elapsed_time()).c_str()); + } else { + printf("\n"); + } + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { + if (!GTEST_FLAG(print_time)) return; + + const std::string counts = + FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); + ColoredPrintf(COLOR_GREEN, "[----------] "); + printf("%s from %s (%s ms total)\n\n", + counts.c_str(), test_case.name(), + internal::StreamableToString(test_case.elapsed_time()).c_str()); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart( + const UnitTest& /*unit_test*/) { + ColoredPrintf(COLOR_GREEN, "[----------] "); + printf("Global test environment tear-down\n"); + fflush(stdout); +} + +// Internal helper for printing the list of failed tests. +void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) { + const int failed_test_count = unit_test.failed_test_count(); + if (failed_test_count == 0) { + return; + } + + for (int i = 0; i < unit_test.total_test_case_count(); ++i) { + const TestCase& test_case = *unit_test.GetTestCase(i); + if (!test_case.should_run() || (test_case.failed_test_count() == 0)) { + continue; + } + for (int j = 0; j < test_case.total_test_count(); ++j) { + const TestInfo& test_info = *test_case.GetTestInfo(j); + if (!test_info.should_run() || test_info.result()->Passed()) { + continue; + } + ColoredPrintf(COLOR_RED, "[ FAILED ] "); + printf("%s.%s", test_case.name(), test_info.name()); + PrintFullTestCommentIfPresent(test_info); + printf("\n"); + } + } +} + +void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + ColoredPrintf(COLOR_GREEN, "[==========] "); + printf("%s from %s ran.", + FormatTestCount(unit_test.test_to_run_count()).c_str(), + FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str()); + if (GTEST_FLAG(print_time)) { + printf(" (%s ms total)", + internal::StreamableToString(unit_test.elapsed_time()).c_str()); + } + printf("\n"); + ColoredPrintf(COLOR_GREEN, "[ PASSED ] "); + printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str()); + + int num_failures = unit_test.failed_test_count(); + if (!unit_test.Passed()) { + const int failed_test_count = unit_test.failed_test_count(); + ColoredPrintf(COLOR_RED, "[ FAILED ] "); + printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str()); + PrintFailedTests(unit_test); + printf("\n%2d FAILED %s\n", num_failures, + num_failures == 1 ? "TEST" : "TESTS"); + } + + int num_disabled = unit_test.reportable_disabled_test_count(); + if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) { + if (!num_failures) { + printf("\n"); // Add a spacer if no FAILURE banner is displayed. + } + ColoredPrintf(COLOR_YELLOW, + " YOU HAVE %d DISABLED %s\n\n", + num_disabled, + num_disabled == 1 ? "TEST" : "TESTS"); + } + // Ensure that Google Test output is printed before, e.g., heapchecker output. 
+  fflush(stdout);
+}
+
+// End PrettyUnitTestResultPrinter
+
+// class TestEventRepeater
+//
+// This class forwards events to other event listeners.
+class TestEventRepeater : public TestEventListener {
+ public:
+  TestEventRepeater() : forwarding_enabled_(true) {}
+  virtual ~TestEventRepeater();
+  void Append(TestEventListener *listener);
+  TestEventListener* Release(TestEventListener* listener);
+
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled() const { return forwarding_enabled_; }
+  void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; }
+
+  virtual void OnTestProgramStart(const UnitTest& unit_test);
+  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test);
+  virtual void OnTestCaseStart(const TestCase& test_case);
+  virtual void OnTestStart(const TestInfo& test_info);
+  virtual void OnTestPartResult(const TestPartResult& result);
+  virtual void OnTestEnd(const TestInfo& test_info);
+  virtual void OnTestCaseEnd(const TestCase& test_case);
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test);
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+  virtual void OnTestProgramEnd(const UnitTest& unit_test);
+
+ private:
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled_;
+  // The list of listeners that receive events.
+  std::vector<TestEventListener*> listeners_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
+};
+
+TestEventRepeater::~TestEventRepeater() {
+  ForEach(listeners_, Delete<TestEventListener>);
+}
+
+void TestEventRepeater::Append(TestEventListener *listener) {
+  listeners_.push_back(listener);
+}
+
+// TODO(vladl@google.com): Factor the search functionality into Vector::Find.
+TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
+  for (size_t i = 0; i < listeners_.size(); ++i) {
+    if (listeners_[i] == listener) {
+      listeners_.erase(listeners_.begin() + i);
+      return listener;
+    }
+  }
+
+  return NULL;
+}
+
+// Since most methods are very similar, use macros to reduce boilerplate.
+// This defines a member that forwards the call to all listeners.
+#define GTEST_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+  if (forwarding_enabled_) { \
+    for (size_t i = 0; i < listeners_.size(); i++) { \
+      listeners_[i]->Name(parameter); \
+    } \
+  } \
+}
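
As a rough sketch of how the repeater's events reach user code (not part of this patch; the listener class and message below are illustrative only), a test program can append its own listener through the public TestEventListeners interface defined later in this file, and the repeater then forwards every event to it alongside the default printer:

#include <cstdio>
#include "gtest/gtest.h"

// Illustrative listener: reacts to the same OnTestStart events the repeater
// forwards above.  EmptyTestEventListener provides no-op defaults.
class MinimalLogger : public ::testing::EmptyTestEventListener {
  virtual void OnTestStart(const ::testing::TestInfo& test_info) {
    printf("starting %s.%s\n", test_info.test_case_name(), test_info.name());
  }
};

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  ::testing::TestEventListeners& listeners =
      ::testing::UnitTest::GetInstance()->listeners();
  listeners.Append(new MinimalLogger);  // ownership passes to Google Test
  return RUN_ALL_TESTS();
}
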
+// This defines a member that forwards the call to all listeners in reverse
+// order.
+#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+  if (forwarding_enabled_) { \
+    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) { \
+      listeners_[i]->Name(parameter); \
+    } \
+  } \
+}
+
+GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase)
+GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
+GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest)
+
+#undef GTEST_REPEATER_METHOD_
+#undef GTEST_REVERSE_REPEATER_METHOD_
+
+void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test,
+                                             int iteration) {
+  if (forwarding_enabled_) {
+    for (size_t i = 0; i < listeners_.size(); i++) {
+      listeners_[i]->OnTestIterationStart(unit_test, iteration);
+    }
+  }
+}
+
+void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test,
+                                           int iteration) {
+  if (forwarding_enabled_) {
+    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) {
+      listeners_[i]->OnTestIterationEnd(unit_test, iteration);
+    }
+  }
+}
+
+// End TestEventRepeater
+
+// This class generates an XML output file.
+class XmlUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+  explicit XmlUnitTestResultPrinter(const char* output_file);
+
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+
+ private:
+  // Is c a whitespace character that is normalized to a space character
+  // when it appears in an XML attribute value?
+  static bool IsNormalizableWhitespace(char c) {
+    return c == 0x9 || c == 0xA || c == 0xD;
+  }
+
+  // May c appear in a well-formed XML document?
+  static bool IsValidXmlCharacter(char c) {
+    return IsNormalizableWhitespace(c) || c >= 0x20;
+  }
+
+  // Returns an XML-escaped copy of the input string str.  If
+  // is_attribute is true, the text is meant to appear as an attribute
+  // value, and normalizable whitespace is preserved by replacing it
+  // with character references.
+  static std::string EscapeXml(const std::string& str, bool is_attribute);
+
+  // Returns the given string with all characters invalid in XML removed.
+  static std::string RemoveInvalidXmlCharacters(const std::string& str);
+
+  // Convenience wrapper around EscapeXml when str is an attribute value.
+  static std::string EscapeXmlAttribute(const std::string& str) {
+    return EscapeXml(str, true);
+  }
+
+  // Convenience wrapper around EscapeXml when str is not an attribute value.
+  static std::string EscapeXmlText(const char* str) {
+    return EscapeXml(str, false);
+  }
+
+  // Verifies that the given attribute belongs to the given element and
+  // streams the attribute as XML.
+  static void OutputXmlAttribute(std::ostream* stream,
+                                 const std::string& element_name,
+                                 const std::string& name,
+                                 const std::string& value);
+
+  // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+  static void OutputXmlCDataSection(::std::ostream* stream, const char* data);
+
+  // Streams an XML representation of a TestInfo object.
+ static void OutputXmlTestInfo(::std::ostream* stream, + const char* test_case_name, + const TestInfo& test_info); + + // Prints an XML representation of a TestCase object + static void PrintXmlTestCase(::std::ostream* stream, + const TestCase& test_case); + + // Prints an XML summary of unit_test to output stream out. + static void PrintXmlUnitTest(::std::ostream* stream, + const UnitTest& unit_test); + + // Produces a string representing the test properties in a result as space + // delimited XML attributes based on the property key="value" pairs. + // When the std::string is not empty, it includes a space at the beginning, + // to delimit this attribute from prior attributes. + static std::string TestPropertiesAsXmlAttributes(const TestResult& result); + + // The output file. + const std::string output_file_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter); +}; + +// Creates a new XmlUnitTestResultPrinter. +XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file) + : output_file_(output_file) { + if (output_file_.c_str() == NULL || output_file_.empty()) { + fprintf(stderr, "XML output file may not be null\n"); + fflush(stderr); + exit(EXIT_FAILURE); + } +} + +// Called after the unit test ends. +void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + FILE* xmlout = NULL; + FilePath output_file(output_file_); + FilePath output_dir(output_file.RemoveFileName()); + + if (output_dir.CreateDirectoriesRecursively()) { + xmlout = posix::FOpen(output_file_.c_str(), "w"); + } + if (xmlout == NULL) { + // TODO(wan): report the reason of the failure. + // + // We don't do it for now as: + // + // 1. There is no urgent need for it. + // 2. It's a bit involved to make the errno variable thread-safe on + // all three operating systems (Linux, Windows, and Mac OS). + // 3. To interpret the meaning of errno in a thread-safe way, + // we need the strerror_r() function, which is not available on + // Windows. + fprintf(stderr, + "Unable to open file \"%s\"\n", + output_file_.c_str()); + fflush(stderr); + exit(EXIT_FAILURE); + } + std::stringstream stream; + PrintXmlUnitTest(&stream, unit_test); + fprintf(xmlout, "%s", StringStreamToString(&stream).c_str()); + fclose(xmlout); +} + +// Returns an XML-escaped copy of the input string str. If is_attribute +// is true, the text is meant to appear as an attribute value, and +// normalizable whitespace is preserved by replacing it with character +// references. +// +// Invalid XML characters in str, if any, are stripped from the output. +// It is expected that most, if not all, of the text processed by this +// module will consist of ordinary English text. +// If this module is ever modified to produce version 1.1 XML output, +// most invalid characters can be retained using character references. +// TODO(wan): It might be nice to have a minimally invasive, human-readable +// escaping scheme for invalid characters, rather than dropping them. 
+std::string XmlUnitTestResultPrinter::EscapeXml(
+    const std::string& str, bool is_attribute) {
+  Message m;
+
+  for (size_t i = 0; i < str.size(); ++i) {
+    const char ch = str[i];
+    switch (ch) {
+      case '<':
+        m << "&lt;";
+        break;
+      case '>':
+        m << "&gt;";
+        break;
+      case '&':
+        m << "&amp;";
+        break;
+      case '\'':
+        if (is_attribute)
+          m << "&apos;";
+        else
+          m << '\'';
+        break;
+      case '"':
+        if (is_attribute)
+          m << "&quot;";
+        else
+          m << '"';
+        break;
+      default:
+        if (IsValidXmlCharacter(ch)) {
+          if (is_attribute && IsNormalizableWhitespace(ch))
+            m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
+              << ";";
+          else
+            m << ch;
+        }
+        break;
+    }
+  }
+
+  return m.GetString();
+}
+
+// Returns the given string with all characters invalid in XML removed.
+// Currently invalid characters are dropped from the string. An
+// alternative is to replace them with certain characters such as . or ?.
+std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
+    const std::string& str) {
+  std::string output;
+  output.reserve(str.size());
+  for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
+    if (IsValidXmlCharacter(*it))
+      output.push_back(*it);
+
+  return output;
+}
+
+// The following routines generate an XML representation of a UnitTest
+// object.
+//
+// This is how Google Test concepts map to the DTD:
+//
+// <testsuites name="AllTests">        <-- corresponds to a UnitTest object
+//   <testsuite name="testcase-name">  <-- corresponds to a TestCase object
+//     <testcase name="test-name">     <-- corresponds to a TestInfo object
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//                                     <-- individual assertion failures
+//     </testcase>
+//   </testsuite>
+// </testsuites>
+
+// Formats the given time in milliseconds as seconds.
+std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
+  ::std::stringstream ss;
+  ss << (static_cast<double>(ms) * 1e-3);
+  return ss.str();
+}
+
+static bool PortableLocaltime(time_t seconds, struct tm* out) {
+#if defined(_MSC_VER)
+  return localtime_s(out, &seconds) == 0;
+#elif defined(__MINGW32__) || defined(__MINGW64__)
+  // MINGW <time.h> provides neither localtime_r nor localtime_s, but uses
+  // Windows' localtime(), which has a thread-local tm buffer.
+  struct tm* tm_ptr = localtime(&seconds);  // NOLINT
+  if (tm_ptr == NULL)
+    return false;
+  *out = *tm_ptr;
+  return true;
+#else
+  return localtime_r(&seconds, out) != NULL;
+#endif
+}
+
+// Converts the given epoch time in milliseconds to a date string in the ISO
+// 8601 format, without the timezone information.
+std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
+  struct tm time_struct;
+  if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct))
+    return "";
+  // YYYY-MM-DDThh:mm:ss
+  return StreamableToString(time_struct.tm_year + 1900) + "-" +
+      String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+      String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+      String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+      String::FormatIntWidth2(time_struct.tm_min) + ":" +
+      String::FormatIntWidth2(time_struct.tm_sec);
+}
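
The escaping rules above are small enough to restate in isolation. The sketch below is a hypothetical stand-alone approximation of the attribute-mode behaviour (EscapeForXmlAttribute is not the class member, and its handling of non-ASCII bytes is simplified), not part of this patch:

#include <cstdio>
#include <string>

// Hypothetical re-statement of attribute-mode XML escaping: the five XML
// special characters become entities, TAB/LF/CR become numeric character
// references so they survive attribute-value normalization, and remaining
// control characters are dropped.
static std::string EscapeForXmlAttribute(const std::string& str) {
  std::string out;
  for (size_t i = 0; i < str.size(); ++i) {
    const char ch = str[i];
    switch (ch) {
      case '<':  out += "&lt;";    break;
      case '>':  out += "&gt;";    break;
      case '&':  out += "&amp;";   break;
      case '\'': out += "&apos;";  break;
      case '"':  out += "&quot;";  break;
      case '\t': out += "&#x09;";  break;
      case '\n': out += "&#x0A;";  break;
      case '\r': out += "&#x0D;";  break;
      default:
        if (static_cast<unsigned char>(ch) >= 0x20)  // drop control characters
          out += ch;
        break;
    }
  }
  return out;
}

int main() {
  // Prints: Value&lt;int&gt; == &quot;1 &amp; 2&quot;&#x0A;
  printf("%s\n", EscapeForXmlAttribute("Value<int> == \"1 & 2\"\n").c_str());
  return 0;
}
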
+// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
+                                                     const char* data) {
+  const char* segment = data;
+  *stream << "<![CDATA[";
+  for (;;) {
+    const char* const next_segment = strstr(segment, "]]>");
+    if (next_segment != NULL) {
+      stream->write(
+          segment, static_cast<std::streamsize>(next_segment - segment));
+      *stream << "]]>]]&gt;<![CDATA[";
+      segment = next_segment + strlen("]]>");
+    } else {
+      *stream << segment;
+      break;
+    }
+  }
+  *stream << "]]>";
+}
+
+void XmlUnitTestResultPrinter::OutputXmlAttribute(
+    std::ostream* stream,
+    const std::string& element_name,
+    const std::string& name,
+    const std::string& value) {
+  const std::vector<std::string>& allowed_names =
+      GetReservedAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+                   allowed_names.end())
+      << "Attribute " << name << " is not allowed for element <" << element_name
+      << ">.";
+
+  *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\"";
+}
+
+// Prints an XML representation of a TestInfo object.
+// TODO(wan): There is also value in printing properties with the plain printer.
+void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
+                                                 const char* test_case_name,
+                                                 const TestInfo& test_info) {
+  const TestResult& result = *test_info.result();
+  const std::string kTestcase = "testcase";
+
+  *stream << "    <testcase";
+  OutputXmlAttribute(stream, kTestcase, "name", test_info.name());
+
+  if (test_info.value_param() != NULL) {
+    OutputXmlAttribute(stream, kTestcase, "value_param",
+                       test_info.value_param());
+  }
+  if (test_info.type_param() != NULL) {
+    OutputXmlAttribute(stream, kTestcase, "type_param",
+                       test_info.type_param());
+  }
+
+  OutputXmlAttribute(stream, kTestcase, "status",
+                     test_info.should_run() ? "run" : "notrun");
+  OutputXmlAttribute(stream, kTestcase, "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(stream, kTestcase, "classname", test_case_name);
+  *stream << TestPropertiesAsXmlAttributes(result);
+
+  int failures = 0;
+  for (int i = 0; i < result.total_part_count(); ++i) {
+    const TestPartResult& part = result.GetTestPartResult(i);
+    if (part.failed()) {
+      if (++failures == 1) {
+        *stream << ">\n";
+      }
+      const string location = internal::FormatCompilerIndependentFileLocation(
+          part.file_name(), part.line_number());
+      const string summary = location + "\n" + part.summary();
+      *stream << "      <failure message=\""
+              << EscapeXmlAttribute(summary.c_str())
+              << "\" type=\"\">";
+      const string detail = location + "\n" + part.message();
+      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
+      *stream << "</failure>\n";
+    }
+  }
+
+  if (failures == 0)
+    *stream << " />\n";
+  else
+    *stream << "    </" << kTestcase << ">\n";
+}
+
+// Prints an XML representation of a TestCase object
+void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream,
+                                                const TestCase& test_case) {
+  const std::string kTestsuite = "testsuite";
+  *stream << "  <" << kTestsuite;
+  OutputXmlAttribute(stream, kTestsuite, "name", test_case.name());
+  OutputXmlAttribute(stream, kTestsuite, "tests",
+                     StreamableToString(test_case.reportable_test_count()));
+  OutputXmlAttribute(stream, kTestsuite, "failures",
+                     StreamableToString(test_case.failed_test_count()));
+  OutputXmlAttribute(
+      stream, kTestsuite, "disabled",
+      StreamableToString(test_case.reportable_disabled_test_count()));
+  OutputXmlAttribute(stream, kTestsuite, "errors", "0");
+  OutputXmlAttribute(stream, kTestsuite, "time",
+                     FormatTimeInMillisAsSeconds(test_case.elapsed_time()));
+  *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result())
+          << ">\n";
+
+  for (int i = 0; i < test_case.total_test_count(); ++i) {
+    if (test_case.GetTestInfo(i)->is_reportable())
+      OutputXmlTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i));
+  }
+  *stream << "  </" << kTestsuite << ">\n";
+}
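
Taken together, the per-test and per-test-case printers above and the unit-test-level printer that follows emit a report shaped roughly like the sample below. The suite and test names, counts, timestamp, and times are invented for illustration; attribute order follows the code above:

<?xml version="1.0" encoding="UTF-8"?>
<testsuites tests="2" failures="1" disabled="0" errors="0"
            timestamp="2018-01-01T00:00:00" time="0.004" name="AllTests">
  <testsuite name="FooTest" tests="2" failures="1" disabled="0" errors="0" time="0.004">
    <testcase name="HandlesBar" status="run" time="0.001" classname="FooTest" />
    <testcase name="HandlesBaz" status="run" time="0.003" classname="FooTest">
      <failure message="foo_test.cc:42&#x0A;Value of: ..." type=""><![CDATA[foo_test.cc:42
Value of: ...]]></failure>
    </testcase>
  </testsuite>
</testsuites>
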
+// Prints an XML summary of unit_test to output stream out.
+void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
+                                                const UnitTest& unit_test) {
+  const std::string kTestsuites = "testsuites";
+
+  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+  *stream << "<" << kTestsuites;
+
+  OutputXmlAttribute(stream, kTestsuites, "tests",
+                     StreamableToString(unit_test.reportable_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "failures",
+                     StreamableToString(unit_test.failed_test_count()));
+  OutputXmlAttribute(
+      stream, kTestsuites, "disabled",
+      StreamableToString(unit_test.reportable_disabled_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "errors", "0");
+  OutputXmlAttribute(
+      stream, kTestsuites, "timestamp",
+      FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
+  OutputXmlAttribute(stream, kTestsuites, "time",
+                     FormatTimeInMillisAsSeconds(unit_test.elapsed_time()));
+
+  if (GTEST_FLAG(shuffle)) {
+    OutputXmlAttribute(stream, kTestsuites, "random_seed",
+                       StreamableToString(unit_test.random_seed()));
+  }
+
+  *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result());
+
+  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
+  *stream << ">\n";
+
+  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
+    if (unit_test.GetTestCase(i)->reportable_test_count() > 0)
+      PrintXmlTestCase(stream, *unit_test.GetTestCase(i));
+  }
+  *stream << "</" << kTestsuites << ">\n";
+}
+
+// Produces a string representing the test properties in a result as space
+// delimited XML attributes based on the property key="value" pairs.
+std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
+    const TestResult& result) {
+  Message attributes;
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty& property = result.GetTestProperty(i);
+    attributes << " " << property.key() << "="
+        << "\"" << EscapeXmlAttribute(property.value()) << "\"";
+  }
+  return attributes.GetString();
+}
+
+// End XmlUnitTestResultPrinter
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Checks if str contains '=', '&', '%' or '\n' characters. If yes,
+// replaces them by "%xx" where xx is their hexadecimal value. For
+// example, replaces "=" with "%3D".  This algorithm is O(strlen(str))
+// in both time and space -- important as the input str may contain an
+// arbitrarily long test failure message and stack trace.
+string StreamingListener::UrlEncode(const char* str) {
+  string result;
+  result.reserve(strlen(str) + 1);
+  for (char ch = *str; ch != '\0'; ch = *++str) {
+    switch (ch) {
+      case '%':
+      case '=':
+      case '&':
+      case '\n':
+        result.append("%" + String::FormatByte(static_cast<unsigned char>(ch)));
+        break;
+      default:
+        result.push_back(ch);
+        break;
+    }
+  }
+  return result;
+}
+
+void StreamingListener::SocketWriter::MakeConnection() {
+  GTEST_CHECK_(sockfd_ == -1)
+      << "MakeConnection() can't be called when there is already a connection.";
+
+  addrinfo hints;
+  memset(&hints, 0, sizeof(hints));
+  hints.ai_family = AF_UNSPEC;    // To allow both IPv4 and IPv6 addresses.
+  hints.ai_socktype = SOCK_STREAM;
+  addrinfo* servinfo = NULL;
+
+  // Use the getaddrinfo() to get a linked list of IP addresses for
+  // the given host name.
+  const int error_num = getaddrinfo(
+      host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+  if (error_num != 0) {
+    GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
+                        << gai_strerror(error_num);
+  }
+
+  // Loop through all the results and connect to the first we can.
+ for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL; + cur_addr = cur_addr->ai_next) { + sockfd_ = socket( + cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol); + if (sockfd_ != -1) { + // Connect the client socket to the server socket. + if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) { + close(sockfd_); + sockfd_ = -1; + } + } + } + + freeaddrinfo(servinfo); // all done with this structure + + if (sockfd_ == -1) { + GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to " + << host_name_ << ":" << port_num_; + } +} + +// End of class Streaming Listener +#endif // GTEST_CAN_STREAM_RESULTS__ + +// Class ScopedTrace + +// Pushes the given source file location and message onto a per-thread +// trace stack maintained by Google Test. +ScopedTrace::ScopedTrace(const char* file, int line, const Message& message) + GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { + TraceInfo trace; + trace.file = file; + trace.line = line; + trace.message = message.GetString(); + + UnitTest::GetInstance()->PushGTestTrace(trace); +} + +// Pops the info pushed by the c'tor. +ScopedTrace::~ScopedTrace() + GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { + UnitTest::GetInstance()->PopGTestTrace(); +} + + +// class OsStackTraceGetter + +const char* const OsStackTraceGetterInterface::kElidedFramesMarker = + "... " GTEST_NAME_ " internal frames ..."; + +string OsStackTraceGetter::CurrentStackTrace(int /*max_depth*/, + int /*skip_count*/) { + return ""; +} + +void OsStackTraceGetter::UponLeavingGTest() {} + +// A helper class that creates the premature-exit file in its +// constructor and deletes the file in its destructor. +class ScopedPrematureExitFile { + public: + explicit ScopedPrematureExitFile(const char* premature_exit_filepath) + : premature_exit_filepath_(premature_exit_filepath) { + // If a path to the premature-exit file is specified... + if (premature_exit_filepath != NULL && *premature_exit_filepath != '\0') { + // create the file with a single "0" character in it. I/O + // errors are ignored as there's nothing better we can do and we + // don't want to fail the test because of this. + FILE* pfile = posix::FOpen(premature_exit_filepath, "w"); + fwrite("0", 1, 1, pfile); + fclose(pfile); + } + } + + ~ScopedPrematureExitFile() { + if (premature_exit_filepath_ != NULL && *premature_exit_filepath_ != '\0') { + remove(premature_exit_filepath_); + } + } + + private: + const char* const premature_exit_filepath_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile); +}; + +} // namespace internal + +// class TestEventListeners + +TestEventListeners::TestEventListeners() + : repeater_(new internal::TestEventRepeater()), + default_result_printer_(NULL), + default_xml_generator_(NULL) { +} + +TestEventListeners::~TestEventListeners() { delete repeater_; } + +// Returns the standard listener responsible for the default console +// output. Can be removed from the listeners list to shut down default +// console output. Note that removing this object from the listener list +// with Release transfers its ownership to the user. +void TestEventListeners::Append(TestEventListener* listener) { + repeater_->Append(listener); +} + +// Removes the given event listener from the list and returns it. It then +// becomes the caller's responsibility to delete the listener. Returns +// NULL if the listener is not found in the list. 
+TestEventListener* TestEventListeners::Release(TestEventListener* listener) { + if (listener == default_result_printer_) + default_result_printer_ = NULL; + else if (listener == default_xml_generator_) + default_xml_generator_ = NULL; + return repeater_->Release(listener); +} + +// Returns repeater that broadcasts the TestEventListener events to all +// subscribers. +TestEventListener* TestEventListeners::repeater() { return repeater_; } + +// Sets the default_result_printer attribute to the provided listener. +// The listener is also added to the listener list and previous +// default_result_printer is removed from it and deleted. The listener can +// also be NULL in which case it will not be added to the list. Does +// nothing if the previous and the current listener objects are the same. +void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) { + if (default_result_printer_ != listener) { + // It is an error to pass this method a listener that is already in the + // list. + delete Release(default_result_printer_); + default_result_printer_ = listener; + if (listener != NULL) + Append(listener); + } +} + +// Sets the default_xml_generator attribute to the provided listener. The +// listener is also added to the listener list and previous +// default_xml_generator is removed from it and deleted. The listener can +// also be NULL in which case it will not be added to the list. Does +// nothing if the previous and the current listener objects are the same. +void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) { + if (default_xml_generator_ != listener) { + // It is an error to pass this method a listener that is already in the + // list. + delete Release(default_xml_generator_); + default_xml_generator_ = listener; + if (listener != NULL) + Append(listener); + } +} + +// Controls whether events will be forwarded by the repeater to the +// listeners in the list. +bool TestEventListeners::EventForwardingEnabled() const { + return repeater_->forwarding_enabled(); +} + +void TestEventListeners::SuppressEventForwarding() { + repeater_->set_forwarding_enabled(false); +} + +// class UnitTest + +// Gets the singleton UnitTest object. The first time this method is +// called, a UnitTest object is constructed and returned. Consecutive +// calls will return the same object. +// +// We don't protect this under mutex_ as a user is not supposed to +// call this before main() starts, from which point on the return +// value will never change. +UnitTest* UnitTest::GetInstance() { + // When compiled with MSVC 7.1 in optimized mode, destroying the + // UnitTest object upon exiting the program messes up the exit code, + // causing successful tests to appear failed. We have to use a + // different implementation in this case to bypass the compiler bug. + // This implementation makes the compiler happy, at the cost of + // leaking the UnitTest object. + + // CodeGear C++Builder insists on a public destructor for the + // default implementation. Use this implementation to keep good OO + // design with private destructor. + +#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__) + static UnitTest* const instance = new UnitTest; + return instance; +#else + static UnitTest instance; + return &instance; +#endif // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__) +} + +// Gets the number of successful test cases. 
+int UnitTest::successful_test_case_count() const { + return impl()->successful_test_case_count(); +} + +// Gets the number of failed test cases. +int UnitTest::failed_test_case_count() const { + return impl()->failed_test_case_count(); +} + +// Gets the number of all test cases. +int UnitTest::total_test_case_count() const { + return impl()->total_test_case_count(); +} + +// Gets the number of all test cases that contain at least one test +// that should run. +int UnitTest::test_case_to_run_count() const { + return impl()->test_case_to_run_count(); +} + +// Gets the number of successful tests. +int UnitTest::successful_test_count() const { + return impl()->successful_test_count(); +} + +// Gets the number of failed tests. +int UnitTest::failed_test_count() const { return impl()->failed_test_count(); } + +// Gets the number of disabled tests that will be reported in the XML report. +int UnitTest::reportable_disabled_test_count() const { + return impl()->reportable_disabled_test_count(); +} + +// Gets the number of disabled tests. +int UnitTest::disabled_test_count() const { + return impl()->disabled_test_count(); +} + +// Gets the number of tests to be printed in the XML report. +int UnitTest::reportable_test_count() const { + return impl()->reportable_test_count(); +} + +// Gets the number of all tests. +int UnitTest::total_test_count() const { return impl()->total_test_count(); } + +// Gets the number of tests that should run. +int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); } + +// Gets the time of the test program start, in ms from the start of the +// UNIX epoch. +internal::TimeInMillis UnitTest::start_timestamp() const { + return impl()->start_timestamp(); +} + +// Gets the elapsed time, in milliseconds. +internal::TimeInMillis UnitTest::elapsed_time() const { + return impl()->elapsed_time(); +} + +// Returns true iff the unit test passed (i.e. all test cases passed). +bool UnitTest::Passed() const { return impl()->Passed(); } + +// Returns true iff the unit test failed (i.e. some test case failed +// or something outside of all tests failed). +bool UnitTest::Failed() const { return impl()->Failed(); } + +// Gets the i-th test case among all the test cases. i can range from 0 to +// total_test_case_count() - 1. If i is not in that range, returns NULL. +const TestCase* UnitTest::GetTestCase(int i) const { + return impl()->GetTestCase(i); +} + +// Returns the TestResult containing information on test failures and +// properties logged outside of individual test cases. +const TestResult& UnitTest::ad_hoc_test_result() const { + return *impl()->ad_hoc_test_result(); +} + +// Gets the i-th test case among all the test cases. i can range from 0 to +// total_test_case_count() - 1. If i is not in that range, returns NULL. +TestCase* UnitTest::GetMutableTestCase(int i) { + return impl()->GetMutableTestCase(i); +} + +// Returns the list of event listeners that can be used to track events +// inside Google Test. +TestEventListeners& UnitTest::listeners() { + return *impl()->listeners(); +} + +// Registers and returns a global test environment. When a test +// program is run, all global test environments will be set-up in the +// order they were registered. After all tests in the program have +// finished, all global test environments will be torn-down in the +// *reverse* order they were registered. +// +// The UnitTest object takes ownership of the given environment. 
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+Environment* UnitTest::AddEnvironment(Environment* env) {
+  if (env == NULL) {
+    return NULL;
+  }
+
+  impl_->environments().push_back(env);
+  return env;
+}
+
+// Adds a TestPartResult to the current TestResult object.  All Google Test
+// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
+// this to report their results.  The user code should use the
+// assertion macros instead of calling this directly.
+void UnitTest::AddTestPartResult(
+    TestPartResult::Type result_type,
+    const char* file_name,
+    int line_number,
+    const std::string& message,
+    const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) {
+  Message msg;
+  msg << message;
+
+  internal::MutexLock lock(&mutex_);
+  if (impl_->gtest_trace_stack().size() > 0) {
+    msg << "\n" << GTEST_NAME_ << " trace:";
+
+    for (int i = static_cast<int>(impl_->gtest_trace_stack().size());
+         i > 0; --i) {
+      const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
+      msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
+          << " " << trace.message;
+    }
+  }
+
+  if (os_stack_trace.c_str() != NULL && !os_stack_trace.empty()) {
+    msg << internal::kStackTraceMarker << os_stack_trace;
+  }
+
+  const TestPartResult result =
+    TestPartResult(result_type, file_name, line_number,
+                   msg.GetString().c_str());
+  impl_->GetTestPartResultReporterForCurrentThread()->
+      ReportTestPartResult(result);
+
+  if (result_type != TestPartResult::kSuccess) {
+    // gtest_break_on_failure takes precedence over
+    // gtest_throw_on_failure.  This allows a user to set the latter
+    // in the code (perhaps in order to use Google Test assertions
+    // with another testing framework) and specify the former on the
+    // command line for debugging.
+    if (GTEST_FLAG(break_on_failure)) {
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+      // Using DebugBreak on Windows allows gtest to still break into a debugger
+      // when a failure happens and both the --gtest_break_on_failure and
+      // the --gtest_catch_exceptions flags are specified.
+      DebugBreak();
+#else
+      // Dereference NULL through a volatile pointer to prevent the compiler
+      // from removing. We use this rather than abort() or __builtin_trap() for
+      // portability: Symbian doesn't implement abort() well, and some debuggers
+      // don't correctly trap abort().
+      *static_cast<volatile int*>(NULL) = 1;
+#endif  // GTEST_OS_WINDOWS
+    } else if (GTEST_FLAG(throw_on_failure)) {
+#if GTEST_HAS_EXCEPTIONS
+      throw internal::GoogleTestFailureException(result);
+#else
+      // We cannot call abort() as it generates a pop-up in debug mode
+      // that cannot be suppressed in VC 7.1 or below.
+      exit(1);
+#endif
+    }
+  }
+}
+
+// Adds a TestProperty to the current TestResult object when invoked from
+// inside a test, to current TestCase's ad_hoc_test_result_ when invoked
+// from SetUpTestCase or TearDownTestCase, or to the global property set
+// when invoked elsewhere.  If the result already contains a property with
+// the same key, the value will be updated.
+void UnitTest::RecordProperty(const std::string& key,
+                              const std::string& value) {
+  impl_->RecordProperty(TestProperty(key, value));
+}
+
+// Runs all tests in this UnitTest object and prints the result.
+// Returns 0 if successful, or 1 otherwise.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+int UnitTest::Run() { + const bool in_death_test_child_process = + internal::GTEST_FLAG(internal_run_death_test).length() > 0; + + // Google Test implements this protocol for catching that a test + // program exits before returning control to Google Test: + // + // 1. Upon start, Google Test creates a file whose absolute path + // is specified by the environment variable + // TEST_PREMATURE_EXIT_FILE. + // 2. When Google Test has finished its work, it deletes the file. + // + // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before + // running a Google-Test-based test program and check the existence + // of the file at the end of the test execution to see if it has + // exited prematurely. + + // If we are in the child process of a death test, don't + // create/delete the premature exit file, as doing so is unnecessary + // and will confuse the parent process. Otherwise, create/delete + // the file upon entering/leaving this function. If the program + // somehow exits before this function has a chance to return, the + // premature-exit file will be left undeleted, causing a test runner + // that understands the premature-exit-file protocol to report the + // test as having failed. + const internal::ScopedPrematureExitFile premature_exit_file( + in_death_test_child_process ? + NULL : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE")); + + // Captures the value of GTEST_FLAG(catch_exceptions). This value will be + // used for the duration of the program. + impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions)); + +#if GTEST_HAS_SEH + // Either the user wants Google Test to catch exceptions thrown by the + // tests or this is executing in the context of death test child + // process. In either case the user does not want to see pop-up dialogs + // about crashes - they are expected. + if (impl()->catch_exceptions() || in_death_test_child_process) { +# if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + // SetErrorMode doesn't exist on CE. + SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT | + SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX); +# endif // !GTEST_OS_WINDOWS_MOBILE + +# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE + // Death test children can be terminated with _abort(). On Windows, + // _abort() can show a dialog with a warning message. This forces the + // abort message to go to stderr instead. + _set_error_mode(_OUT_TO_STDERR); +# endif + +# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE + // In the debug version, Visual Studio pops up a separate dialog + // offering a choice to debug the aborted program. We need to suppress + // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement + // executed. Google Test will notify the user of any unexpected + // failure via stderr. + // + // VC++ doesn't define _set_abort_behavior() prior to the version 8.0. + // Users of prior VC versions shall suffer the agony and pain of + // clicking through the countless debug dialogs. + // TODO(vladl@google.com): find a way to suppress the abort dialog() in the + // debug mode when compiled with VC 7.1 or lower. + if (!GTEST_FLAG(break_on_failure)) + _set_abort_behavior( + 0x0, // Clear the following flags: + _WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump. +# endif + } +#endif // GTEST_HAS_SEH + + return internal::HandleExceptionsInMethodIfSupported( + impl(), + &internal::UnitTestImpl::RunAllTests, + "auxiliary test code (environments or event listeners)") ? 
0 : 1; +} + +// Returns the working directory when the first TEST() or TEST_F() was +// executed. +const char* UnitTest::original_working_dir() const { + return impl_->original_working_dir_.c_str(); +} + +// Returns the TestCase object for the test that's currently running, +// or NULL if no test is running. +const TestCase* UnitTest::current_test_case() const + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + return impl_->current_test_case(); +} + +// Returns the TestInfo object for the test that's currently running, +// or NULL if no test is running. +const TestInfo* UnitTest::current_test_info() const + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + return impl_->current_test_info(); +} + +// Returns the random seed used at the start of the current test run. +int UnitTest::random_seed() const { return impl_->random_seed(); } + +#if GTEST_HAS_PARAM_TEST +// Returns ParameterizedTestCaseRegistry object used to keep track of +// value-parameterized tests and instantiate and register them. +internal::ParameterizedTestCaseRegistry& + UnitTest::parameterized_test_registry() + GTEST_LOCK_EXCLUDED_(mutex_) { + return impl_->parameterized_test_registry(); +} +#endif // GTEST_HAS_PARAM_TEST + +// Creates an empty UnitTest. +UnitTest::UnitTest() { + impl_ = new internal::UnitTestImpl(this); +} + +// Destructor of UnitTest. +UnitTest::~UnitTest() { + delete impl_; +} + +// Pushes a trace defined by SCOPED_TRACE() on to the per-thread +// Google Test trace stack. +void UnitTest::PushGTestTrace(const internal::TraceInfo& trace) + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + impl_->gtest_trace_stack().push_back(trace); +} + +// Pops a trace from the per-thread Google Test trace stack. +void UnitTest::PopGTestTrace() + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + impl_->gtest_trace_stack().pop_back(); +} + +namespace internal { + +UnitTestImpl::UnitTestImpl(UnitTest* parent) + : parent_(parent), + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4355 /* using this in initializer */) + default_global_test_part_result_reporter_(this), + default_per_thread_test_part_result_reporter_(this), + GTEST_DISABLE_MSC_WARNINGS_POP_() + global_test_part_result_repoter_( + &default_global_test_part_result_reporter_), + per_thread_test_part_result_reporter_( + &default_per_thread_test_part_result_reporter_), +#if GTEST_HAS_PARAM_TEST + parameterized_test_registry_(), + parameterized_tests_registered_(false), +#endif // GTEST_HAS_PARAM_TEST + last_death_test_case_(-1), + current_test_case_(NULL), + current_test_info_(NULL), + ad_hoc_test_result_(), + os_stack_trace_getter_(NULL), + post_flag_parse_init_performed_(false), + random_seed_(0), // Will be overridden by the flag before first use. + random_(0), // Will be reseeded before first use. + start_timestamp_(0), + elapsed_time_(0), +#if GTEST_HAS_DEATH_TEST + death_test_factory_(new DefaultDeathTestFactory), +#endif + // Will be overridden by the flag before first use. + catch_exceptions_(false) { + listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter); +} + +UnitTestImpl::~UnitTestImpl() { + // Deletes every TestCase. + ForEach(test_cases_, internal::Delete); + + // Deletes every Environment. 
+ ForEach(environments_, internal::Delete); + + delete os_stack_trace_getter_; +} + +// Adds a TestProperty to the current TestResult object when invoked in a +// context of a test, to current test case's ad_hoc_test_result when invoke +// from SetUpTestCase/TearDownTestCase, or to the global property set +// otherwise. If the result already contains a property with the same key, +// the value will be updated. +void UnitTestImpl::RecordProperty(const TestProperty& test_property) { + std::string xml_element; + TestResult* test_result; // TestResult appropriate for property recording. + + if (current_test_info_ != NULL) { + xml_element = "testcase"; + test_result = &(current_test_info_->result_); + } else if (current_test_case_ != NULL) { + xml_element = "testsuite"; + test_result = &(current_test_case_->ad_hoc_test_result_); + } else { + xml_element = "testsuites"; + test_result = &ad_hoc_test_result_; + } + test_result->RecordProperty(xml_element, test_property); +} + +#if GTEST_HAS_DEATH_TEST +// Disables event forwarding if the control is currently in a death test +// subprocess. Must not be called before InitGoogleTest. +void UnitTestImpl::SuppressTestEventsIfInSubprocess() { + if (internal_run_death_test_flag_.get() != NULL) + listeners()->SuppressEventForwarding(); +} +#endif // GTEST_HAS_DEATH_TEST + +// Initializes event listeners performing XML output as specified by +// UnitTestOptions. Must not be called before InitGoogleTest. +void UnitTestImpl::ConfigureXmlOutput() { + const std::string& output_format = UnitTestOptions::GetOutputFormat(); + if (output_format == "xml") { + listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str())); + } else if (output_format != "") { + printf("WARNING: unrecognized output format \"%s\" ignored.\n", + output_format.c_str()); + fflush(stdout); + } +} + +#if GTEST_CAN_STREAM_RESULTS_ +// Initializes event listeners for streaming test results in string form. +// Must not be called before InitGoogleTest. +void UnitTestImpl::ConfigureStreamingOutput() { + const std::string& target = GTEST_FLAG(stream_result_to); + if (!target.empty()) { + const size_t pos = target.find(':'); + if (pos != std::string::npos) { + listeners()->Append(new StreamingListener(target.substr(0, pos), + target.substr(pos+1))); + } else { + printf("WARNING: unrecognized streaming target \"%s\" ignored.\n", + target.c_str()); + fflush(stdout); + } + } +} +#endif // GTEST_CAN_STREAM_RESULTS_ + +// Performs initialization dependent upon flag values obtained in +// ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to +// ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest +// this function is also called from RunAllTests. Since this function can be +// called more than once, it has to be idempotent. +void UnitTestImpl::PostFlagParsingInit() { + // Ensures that this function does not execute more than once. + if (!post_flag_parse_init_performed_) { + post_flag_parse_init_performed_ = true; + +#if defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_) + // Register to send notifications about key process state changes. + listeners()->Append(new GTEST_CUSTOM_TEST_EVENT_LISTENER_()); +#endif // defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_) + +#if GTEST_HAS_DEATH_TEST + InitDeathTestSubprocessControlInfo(); + SuppressTestEventsIfInSubprocess(); +#endif // GTEST_HAS_DEATH_TEST + + // Registers parameterized tests. 
This makes parameterized tests
+    // available to the UnitTest reflection API without running
+    // RUN_ALL_TESTS.
+    RegisterParameterizedTests();
+
+    // Configures listeners for XML output. This makes it possible for users
+    // to shut down the default XML output before invoking RUN_ALL_TESTS.
+    ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+    // Configures listeners for streaming test results to the specified server.
+    ConfigureStreamingOutput();
+#endif  // GTEST_CAN_STREAM_RESULTS_
+  }
+}
+
+// A predicate that checks the name of a TestCase against a known
+// value.
+//
+// This is used for implementation of the UnitTest class only.  We put
+// it in the anonymous namespace to prevent polluting the outer
+// namespace.
+//
+// TestCaseNameIs is copyable.
+class TestCaseNameIs {
+ public:
+  // Constructor.
+  explicit TestCaseNameIs(const std::string& name)
+      : name_(name) {}
+
+  // Returns true iff the name of test_case matches name_.
+  bool operator()(const TestCase* test_case) const {
+    return test_case != NULL && strcmp(test_case->name(), name_.c_str()) == 0;
+  }
+
+ private:
+  std::string name_;
+};
+
+// Finds and returns a TestCase with the given name.  If one doesn't
+// exist, creates one and returns it.  It's the CALLER'S
+// RESPONSIBILITY to ensure that this function is only called WHEN THE
+// TESTS ARE NOT SHUFFLED.
+//
+// Arguments:
+//
+//   test_case_name: name of the test case
+//   type_param:     the name of the test case's type parameter, or NULL if
+//                   this is not a typed or a type-parameterized test case.
+//   set_up_tc:      pointer to the function that sets up the test case
+//   tear_down_tc:   pointer to the function that tears down the test case
+TestCase* UnitTestImpl::GetTestCase(const char* test_case_name,
+                                    const char* type_param,
+                                    Test::SetUpTestCaseFunc set_up_tc,
+                                    Test::TearDownTestCaseFunc tear_down_tc) {
+  // Can we find a TestCase with the given name?
+  const std::vector<TestCase*>::const_iterator test_case =
+      std::find_if(test_cases_.begin(), test_cases_.end(),
+                   TestCaseNameIs(test_case_name));
+
+  if (test_case != test_cases_.end())
+    return *test_case;
+
+  // No.  Let's create one.
+  TestCase* const new_test_case =
+      new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc);
+
+  // Is this a death test case?
+  if (internal::UnitTestOptions::MatchesFilter(test_case_name,
+                                               kDeathTestCaseFilter)) {
+    // Yes.  Inserts the test case after the last death test case
+    // defined so far.  This only works when the test cases haven't
+    // been shuffled.  Otherwise we may end up running a death test
+    // after a non-death test.
+    ++last_death_test_case_;
+    test_cases_.insert(test_cases_.begin() + last_death_test_case_,
+                       new_test_case);
+  } else {
+    // No.  Appends to the end of the list.
+    test_cases_.push_back(new_test_case);
+  }
+
+  test_case_indices_.push_back(static_cast<int>(test_case_indices_.size()));
+  return new_test_case;
+}
+
+// Helpers for setting up / tearing down the given environment.  They
+// are for use in the ForEach() function.
+static void SetUpEnvironment(Environment* env) { env->SetUp(); }
+static void TearDownEnvironment(Environment* env) { env->TearDown(); }
+
+// Runs all tests in this UnitTest object, prints the result, and
+// returns true if all tests are successful.  If any exception is
+// thrown during a test, the test is considered to be failed, but the
+// rest of the tests will still be run.
+//
+// When parameterized tests are enabled, it expands and registers
+// parameterized tests first in RegisterParameterizedTests().
+// All other functions called from RunAllTests() may safely assume that +// parameterized tests are ready to be counted and run. +bool UnitTestImpl::RunAllTests() { + // Makes sure InitGoogleTest() was called. + if (!GTestIsInitialized()) { + printf("%s", + "\nThis test program did NOT call ::testing::InitGoogleTest " + "before calling RUN_ALL_TESTS(). Please fix it.\n"); + return false; + } + + // Do not run any test if the --help flag was specified. + if (g_help_flag) + return true; + + // Repeats the call to the post-flag parsing initialization in case the + // user didn't call InitGoogleTest. + PostFlagParsingInit(); + + // Even if sharding is not on, test runners may want to use the + // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding + // protocol. + internal::WriteToShardStatusFileIfNeeded(); + + // True iff we are in a subprocess for running a thread-safe-style + // death test. + bool in_subprocess_for_death_test = false; + +#if GTEST_HAS_DEATH_TEST + in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL); +# if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) + if (in_subprocess_for_death_test) { + GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_(); + } +# endif // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) +#endif // GTEST_HAS_DEATH_TEST + + const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex, + in_subprocess_for_death_test); + + // Compares the full test names with the filter to decide which + // tests to run. + const bool has_tests_to_run = FilterTests(should_shard + ? HONOR_SHARDING_PROTOCOL + : IGNORE_SHARDING_PROTOCOL) > 0; + + // Lists the tests and exits if the --gtest_list_tests flag was specified. + if (GTEST_FLAG(list_tests)) { + // This must be called *after* FilterTests() has been called. + ListTestsMatchingFilter(); + return true; + } + + random_seed_ = GTEST_FLAG(shuffle) ? + GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0; + + // True iff at least one test has failed. + bool failed = false; + + TestEventListener* repeater = listeners()->repeater(); + + start_timestamp_ = GetTimeInMillis(); + repeater->OnTestProgramStart(*parent_); + + // How many times to repeat the tests? We don't want to repeat them + // when we are inside the subprocess of a death test. + const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat); + // Repeats forever if the repeat count is negative. + const bool forever = repeat < 0; + for (int i = 0; forever || i != repeat; i++) { + // We want to preserve failures generated by ad-hoc test + // assertions executed before RUN_ALL_TESTS(). + ClearNonAdHocTestResult(); + + const TimeInMillis start = GetTimeInMillis(); + + // Shuffles test cases and tests if requested. + if (has_tests_to_run && GTEST_FLAG(shuffle)) { + random()->Reseed(random_seed_); + // This should be done before calling OnTestIterationStart(), + // such that a test event listener can see the actual test order + // in the event. + ShuffleTests(); + } + + // Tells the unit test event listeners that the tests are about to start. + repeater->OnTestIterationStart(*parent_, i); + + // Runs each test case if there is at least one test to run. + if (has_tests_to_run) { + // Sets up all environments beforehand. + repeater->OnEnvironmentsSetUpStart(*parent_); + ForEach(environments_, SetUpEnvironment); + repeater->OnEnvironmentsSetUpEnd(*parent_); + + // Runs the tests only if there was no fatal failure during global + // set-up. 
+ if (!Test::HasFatalFailure()) { + for (int test_index = 0; test_index < total_test_case_count(); + test_index++) { + GetMutableTestCase(test_index)->Run(); + } + } + + // Tears down all environments in reverse order afterwards. + repeater->OnEnvironmentsTearDownStart(*parent_); + std::for_each(environments_.rbegin(), environments_.rend(), + TearDownEnvironment); + repeater->OnEnvironmentsTearDownEnd(*parent_); + } + + elapsed_time_ = GetTimeInMillis() - start; + + // Tells the unit test event listener that the tests have just finished. + repeater->OnTestIterationEnd(*parent_, i); + + // Gets the result and clears it. + if (!Passed()) { + failed = true; + } + + // Restores the original test order after the iteration. This + // allows the user to quickly repro a failure that happens in the + // N-th iteration without repeating the first (N - 1) iterations. + // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in + // case the user somehow changes the value of the flag somewhere + // (it's always safe to unshuffle the tests). + UnshuffleTests(); + + if (GTEST_FLAG(shuffle)) { + // Picks a new random seed for each iteration. + random_seed_ = GetNextRandomSeed(random_seed_); + } + } + + repeater->OnTestProgramEnd(*parent_); + + return !failed; +} + +// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file +// if the variable is present. If a file already exists at this location, this +// function will write over it. If the variable is present, but the file cannot +// be created, prints an error and exits. +void WriteToShardStatusFileIfNeeded() { + const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile); + if (test_shard_file != NULL) { + FILE* const file = posix::FOpen(test_shard_file, "w"); + if (file == NULL) { + ColoredPrintf(COLOR_RED, + "Could not write to the test shard status file \"%s\" " + "specified by the %s environment variable.\n", + test_shard_file, kTestShardStatusFile); + fflush(stdout); + exit(EXIT_FAILURE); + } + fclose(file); + } +} + +// Checks whether sharding is enabled by examining the relevant +// environment variable values. If the variables are present, +// but inconsistent (i.e., shard_index >= total_shards), prints +// an error and exits. If in_subprocess_for_death_test, sharding is +// disabled because it must only be applied to the original test +// process. Otherwise, we could filter out death tests we intended to execute. 
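
Before the helpers that implement it, a stand-alone illustration of the sharding contract may help: the shard count and shard index come from the environment (read through kTestTotalShards and kTestShardIndex), and runnable tests are dealt out round-robin by ShouldRunTestOnShard() below, so each test runs on exactly one shard. The snippet is not gtest code and the numbers are made up:

#include <cstdio>

// Round-robin shard assignment: runnable test i goes to shard
// (i % total_shards), mirroring ShouldRunTestOnShard() below.
int main() {
  const int total_shards = 3;  // e.g. a total-shard count of 3 from the environment
  for (int shard_index = 0; shard_index < total_shards; ++shard_index) {
    printf("shard %d runs tests:", shard_index);
    for (int test_id = 0; test_id < 8; ++test_id) {
      if (test_id % total_shards == shard_index)  // the ShouldRunTestOnShard() rule
        printf(" %d", test_id);
    }
    printf("\n");
  }
  return 0;
}
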
+bool ShouldShard(const char* total_shards_env, + const char* shard_index_env, + bool in_subprocess_for_death_test) { + if (in_subprocess_for_death_test) { + return false; + } + + const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1); + const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1); + + if (total_shards == -1 && shard_index == -1) { + return false; + } else if (total_shards == -1 && shard_index != -1) { + const Message msg = Message() + << "Invalid environment variables: you have " + << kTestShardIndex << " = " << shard_index + << ", but have left " << kTestTotalShards << " unset.\n"; + ColoredPrintf(COLOR_RED, msg.GetString().c_str()); + fflush(stdout); + exit(EXIT_FAILURE); + } else if (total_shards != -1 && shard_index == -1) { + const Message msg = Message() + << "Invalid environment variables: you have " + << kTestTotalShards << " = " << total_shards + << ", but have left " << kTestShardIndex << " unset.\n"; + ColoredPrintf(COLOR_RED, msg.GetString().c_str()); + fflush(stdout); + exit(EXIT_FAILURE); + } else if (shard_index < 0 || shard_index >= total_shards) { + const Message msg = Message() + << "Invalid environment variables: we require 0 <= " + << kTestShardIndex << " < " << kTestTotalShards + << ", but you have " << kTestShardIndex << "=" << shard_index + << ", " << kTestTotalShards << "=" << total_shards << ".\n"; + ColoredPrintf(COLOR_RED, msg.GetString().c_str()); + fflush(stdout); + exit(EXIT_FAILURE); + } + + return total_shards > 1; +} + +// Parses the environment variable var as an Int32. If it is unset, +// returns default_val. If it is not an Int32, prints an error +// and aborts. +Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) { + const char* str_val = posix::GetEnv(var); + if (str_val == NULL) { + return default_val; + } + + Int32 result; + if (!ParseInt32(Message() << "The value of environment variable " << var, + str_val, &result)) { + exit(EXIT_FAILURE); + } + return result; +} + +// Given the total number of shards, the shard index, and the test id, +// returns true iff the test should be run on this shard. The test id is +// some arbitrary but unique non-negative integer assigned to each test +// method. Assumes that 0 <= shard_index < total_shards. +bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) { + return (test_id % total_shards) == shard_index; +} + +// Compares the name of each test with the user-specified filter to +// decide whether the test should be run, then records the result in +// each TestCase and TestInfo object. +// If shard_tests == true, further filters tests based on sharding +// variables in the environment - see +// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide. +// Returns the number of tests that should run. +int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { + const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ? + Int32FromEnvOrDie(kTestTotalShards, -1) : -1; + const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ? + Int32FromEnvOrDie(kTestShardIndex, -1) : -1; + + // num_runnable_tests are the number of tests that will + // run across all shards (i.e., match filter and are not disabled). + // num_selected_tests are the number of tests to be run on + // this shard. 
+ int num_runnable_tests = 0; + int num_selected_tests = 0; + for (size_t i = 0; i < test_cases_.size(); i++) { + TestCase* const test_case = test_cases_[i]; + const std::string &test_case_name = test_case->name(); + test_case->set_should_run(false); + + for (size_t j = 0; j < test_case->test_info_list().size(); j++) { + TestInfo* const test_info = test_case->test_info_list()[j]; + const std::string test_name(test_info->name()); + // A test is disabled if test case name or test name matches + // kDisableTestFilter. + const bool is_disabled = + internal::UnitTestOptions::MatchesFilter(test_case_name, + kDisableTestFilter) || + internal::UnitTestOptions::MatchesFilter(test_name, + kDisableTestFilter); + test_info->is_disabled_ = is_disabled; + + const bool matches_filter = + internal::UnitTestOptions::FilterMatchesTest(test_case_name, + test_name); + test_info->matches_filter_ = matches_filter; + + const bool is_runnable = + (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) && + matches_filter; + + const bool is_selected = is_runnable && + (shard_tests == IGNORE_SHARDING_PROTOCOL || + ShouldRunTestOnShard(total_shards, shard_index, + num_runnable_tests)); + + num_runnable_tests += is_runnable; + num_selected_tests += is_selected; + + test_info->should_run_ = is_selected; + test_case->set_should_run(test_case->should_run() || is_selected); + } + } + return num_selected_tests; +} + +// Prints the given C-string on a single line by replacing all '\n' +// characters with string "\\n". If the output takes more than +// max_length characters, only prints the first max_length characters +// and "...". +static void PrintOnOneLine(const char* str, int max_length) { + if (str != NULL) { + for (int i = 0; *str != '\0'; ++str) { + if (i >= max_length) { + printf("..."); + break; + } + if (*str == '\n') { + printf("\\n"); + i += 2; + } else { + printf("%c", *str); + ++i; + } + } + } +} + +// Prints the names of the tests matching the user-specified filter flag. +void UnitTestImpl::ListTestsMatchingFilter() { + // Print at most this many characters for each type/value parameter. + const int kMaxParamLength = 250; + + for (size_t i = 0; i < test_cases_.size(); i++) { + const TestCase* const test_case = test_cases_[i]; + bool printed_test_case_name = false; + + for (size_t j = 0; j < test_case->test_info_list().size(); j++) { + const TestInfo* const test_info = + test_case->test_info_list()[j]; + if (test_info->matches_filter_) { + if (!printed_test_case_name) { + printed_test_case_name = true; + printf("%s.", test_case->name()); + if (test_case->type_param() != NULL) { + printf(" # %s = ", kTypeParamLabel); + // We print the type parameter on a single line to make + // the output easy to parse by a program. + PrintOnOneLine(test_case->type_param(), kMaxParamLength); + } + printf("\n"); + } + printf(" %s", test_info->name()); + if (test_info->value_param() != NULL) { + printf(" # %s = ", kValueParamLabel); + // We print the value parameter on a single line to make the + // output easy to parse by a program. + PrintOnOneLine(test_info->value_param(), kMaxParamLength); + } + printf("\n"); + } + } + } + fflush(stdout); +} + +// Sets the OS stack trace getter. +// +// Does nothing if the input and the current OS stack trace getter are +// the same; otherwise, deletes the old getter and makes the input the +// current getter. 
+void UnitTestImpl::set_os_stack_trace_getter(
+    OsStackTraceGetterInterface* getter) {
+  if (os_stack_trace_getter_ != getter) {
+    delete os_stack_trace_getter_;
+    os_stack_trace_getter_ = getter;
+  }
+}
+
+// Returns the current OS stack trace getter if it is not NULL;
+// otherwise, creates an OsStackTraceGetter, makes it the current
+// getter, and returns it.
+OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() {
+  if (os_stack_trace_getter_ == NULL) {
+#ifdef GTEST_OS_STACK_TRACE_GETTER_
+    os_stack_trace_getter_ = new GTEST_OS_STACK_TRACE_GETTER_;
+#else
+    os_stack_trace_getter_ = new OsStackTraceGetter;
+#endif  // GTEST_OS_STACK_TRACE_GETTER_
+  }
+
+  return os_stack_trace_getter_;
+}
+
+// Returns the TestResult for the test that's currently running, or
+// the TestResult for the ad hoc test if no test is running.
+TestResult* UnitTestImpl::current_test_result() {
+  return current_test_info_ ?
+      &(current_test_info_->result_) : &ad_hoc_test_result_;
+}
+
+// Shuffles all test cases, and the tests within each test case,
+// making sure that death tests are still run first.
+void UnitTestImpl::ShuffleTests() {
+  // Shuffles the death test cases.
+  ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_);
+
+  // Shuffles the non-death test cases.
+  ShuffleRange(random(), last_death_test_case_ + 1,
+               static_cast<int>(test_cases_.size()), &test_case_indices_);
+
+  // Shuffles the tests inside each test case.
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    test_cases_[i]->ShuffleTests(random());
+  }
+}
+
+// Restores the test cases and tests to their order before the first shuffle.
+void UnitTestImpl::UnshuffleTests() {
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    // Unshuffles the tests in each test case.
+    test_cases_[i]->UnshuffleTests();
+    // Resets the index of each test case.
+    test_case_indices_[i] = static_cast<int>(i);
+  }
+}
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
+                                            int skip_count) {
+  // We pass skip_count + 1 to skip this wrapper function in addition
+  // to what the user really wants to skip.
+  return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1);
+}
+
+// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to
+// suppress unreachable code warnings.
+namespace {
+class ClassUniqueToAlwaysTrue {};
+}
+
+bool IsTrue(bool condition) { return condition; }
+
+bool AlwaysTrue() {
+#if GTEST_HAS_EXCEPTIONS
+  // This condition is always false so AlwaysTrue() never actually throws,
+  // but it makes the compiler think that it may throw.
+  if (IsTrue(false))
+    throw ClassUniqueToAlwaysTrue();
+#endif  // GTEST_HAS_EXCEPTIONS
+  return true;
+}
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false.  None of pstr, *pstr, and prefix can be NULL.
+bool SkipPrefix(const char* prefix, const char** pstr) { + const size_t prefix_len = strlen(prefix); + if (strncmp(*pstr, prefix, prefix_len) == 0) { + *pstr += prefix_len; + return true; + } + return false; +} + +// Parses a string as a command line flag. The string should have +// the format "--flag=value". When def_optional is true, the "=value" +// part can be omitted. +// +// Returns the value of the flag, or NULL if the parsing failed. +const char* ParseFlagValue(const char* str, + const char* flag, + bool def_optional) { + // str and flag must not be NULL. + if (str == NULL || flag == NULL) return NULL; + + // The flag must start with "--" followed by GTEST_FLAG_PREFIX_. + const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag; + const size_t flag_len = flag_str.length(); + if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL; + + // Skips the flag name. + const char* flag_end = str + flag_len; + + // When def_optional is true, it's OK to not have a "=value" part. + if (def_optional && (flag_end[0] == '\0')) { + return flag_end; + } + + // If def_optional is true and there are more characters after the + // flag name, or if def_optional is false, there must be a '=' after + // the flag name. + if (flag_end[0] != '=') return NULL; + + // Returns the string after "=". + return flag_end + 1; +} + +// Parses a string for a bool flag, in the form of either +// "--flag=value" or "--flag". +// +// In the former case, the value is taken as true as long as it does +// not start with '0', 'f', or 'F'. +// +// In the latter case, the value is taken as true. +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseBoolFlag(const char* str, const char* flag, bool* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, true); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Converts the string value to a bool. + *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F'); + return true; +} + +// Parses a string for an Int32 flag, in the form of +// "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseInt32Flag(const char* str, const char* flag, Int32* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, false); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Sets *value to the value of the flag. + return ParseInt32(Message() << "The value of flag --" << flag, + value_str, value); +} + +// Parses a string for a string flag, in the form of +// "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseStringFlag(const char* str, const char* flag, std::string* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, false); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Sets *value to the value of the flag. + *value = value_str; + return true; +} + +// Determines whether a string has a prefix that Google Test uses for its +// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_. 
+// If Google Test detects that a command line flag has its prefix but is not +// recognized, it will print its help message. Flags starting with +// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test +// internal flags and do not trigger the help message. +static bool HasGoogleTestFlagPrefix(const char* str) { + return (SkipPrefix("--", &str) || + SkipPrefix("-", &str) || + SkipPrefix("/", &str)) && + !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) && + (SkipPrefix(GTEST_FLAG_PREFIX_, &str) || + SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str)); +} + +// Prints a string containing code-encoded text. The following escape +// sequences can be used in the string to control the text color: +// +// @@ prints a single '@' character. +// @R changes the color to red. +// @G changes the color to green. +// @Y changes the color to yellow. +// @D changes to the default terminal text color. +// +// TODO(wan@google.com): Write tests for this once we add stdout +// capturing to Google Test. +static void PrintColorEncoded(const char* str) { + GTestColor color = COLOR_DEFAULT; // The current color. + + // Conceptually, we split the string into segments divided by escape + // sequences. Then we print one segment at a time. At the end of + // each iteration, the str pointer advances to the beginning of the + // next segment. + for (;;) { + const char* p = strchr(str, '@'); + if (p == NULL) { + ColoredPrintf(color, "%s", str); + return; + } + + ColoredPrintf(color, "%s", std::string(str, p).c_str()); + + const char ch = p[1]; + str = p + 2; + if (ch == '@') { + ColoredPrintf(color, "@"); + } else if (ch == 'D') { + color = COLOR_DEFAULT; + } else if (ch == 'R') { + color = COLOR_RED; + } else if (ch == 'G') { + color = COLOR_GREEN; + } else if (ch == 'Y') { + color = COLOR_YELLOW; + } else { + --str; + } + } +} + +static const char kColorEncodedHelpMessage[] = +"This program contains tests written using " GTEST_NAME_ ". You can use the\n" +"following command line flags to control its behavior:\n" +"\n" +"Test Selection:\n" +" @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n" +" List the names of all tests instead of running them. The name of\n" +" TEST(Foo, Bar) is \"Foo.Bar\".\n" +" @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS" + "[@G-@YNEGATIVE_PATTERNS]@D\n" +" Run only the tests whose name matches one of the positive patterns but\n" +" none of the negative patterns. '?' matches any single character; '*'\n" +" matches any substring; ':' separates two patterns.\n" +" @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n" +" Run all disabled tests too.\n" +"\n" +"Test Execution:\n" +" @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n" +" Run the tests repeatedly; use a negative count to repeat forever.\n" +" @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n" +" Randomize tests' orders on every iteration.\n" +" @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n" +" Random number seed to use for shuffling test orders (between 1 and\n" +" 99999, or 0 to use a seed based on the current time).\n" +"\n" +"Test Output:\n" +" @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n" +" Enable/disable colored output. The default is @Gauto@D.\n" +" -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n" +" Don't print the elapsed time of each test.\n" +" @G--" GTEST_FLAG_PREFIX_ "output=xml@Y[@G:@YDIRECTORY_PATH@G" + GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n" +" Generate an XML report in the given directory or with the given file\n" +" name. 
@YFILE_PATH@D defaults to @Gtest_details.xml@D.\n"
+#if GTEST_CAN_STREAM_RESULTS_
+"  @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n"
+"      Stream test results to the given server.\n"
+#endif  // GTEST_CAN_STREAM_RESULTS_
+"\n"
+"Assertion Behavior:\n"
+#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+"  @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
+"      Set the default death test style.\n"
+#endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+"  @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n"
+"      Turn assertion failures into debugger break-points.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n"
+"      Turn assertion failures into C++ exceptions.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n"
+"      Do not report exceptions as test failures. Instead, allow them\n"
+"      to crash the program or throw a pop-up (on Windows).\n"
+"\n"
+"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set "
+    "the corresponding\n"
+"environment variable of a flag (all letters in upper-case). For example, to\n"
+"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_
+    "color=no@D or set\n"
+"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n"
+"\n"
+"For more information, please read the " GTEST_NAME_ " documentation at\n"
+"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n"
+"(not one in your own code or tests), please report it to\n"
+"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
+
+bool ParseGoogleTestFlag(const char* const arg) {
+  return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
+                       &GTEST_FLAG(also_run_disabled_tests)) ||
+      ParseBoolFlag(arg, kBreakOnFailureFlag,
+                    &GTEST_FLAG(break_on_failure)) ||
+      ParseBoolFlag(arg, kCatchExceptionsFlag,
+                    &GTEST_FLAG(catch_exceptions)) ||
+      ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
+      ParseStringFlag(arg, kDeathTestStyleFlag,
+                      &GTEST_FLAG(death_test_style)) ||
+      ParseBoolFlag(arg, kDeathTestUseFork,
+                    &GTEST_FLAG(death_test_use_fork)) ||
+      ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
+      ParseStringFlag(arg, kInternalRunDeathTestFlag,
+                      &GTEST_FLAG(internal_run_death_test)) ||
+      ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
+      ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
+      ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
+      ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
+      ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
+      ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
+      ParseInt32Flag(arg, kStackTraceDepthFlag,
+                     &GTEST_FLAG(stack_trace_depth)) ||
+      ParseStringFlag(arg, kStreamResultToFlag,
+                      &GTEST_FLAG(stream_result_to)) ||
+      ParseBoolFlag(arg, kThrowOnFailureFlag,
+                    &GTEST_FLAG(throw_on_failure));
+}
+
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+void LoadFlagsFromFile(const std::string& path) {
+  FILE* flagfile = posix::FOpen(path.c_str(), "r");
+  if (!flagfile) {
+    fprintf(stderr,
+            "Unable to open file \"%s\"\n",
+            GTEST_FLAG(flagfile).c_str());
+    fflush(stderr);
+    exit(EXIT_FAILURE);
+  }
+  std::string contents(ReadEntireFile(flagfile));
+  posix::FClose(flagfile);
+  std::vector<std::string> lines;
+  SplitString(contents, '\n', &lines);
+  for (size_t i = 0; i < lines.size(); ++i) {
+    if (lines[i].empty())
+      continue;
+    if (!ParseGoogleTestFlag(lines[i].c_str()))
+      g_help_flag = true;
+  }
+}
+#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
+
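The flag machinery above recognizes both "--flag" and "--flag=value" forms via ParseFlagValue(). The following standalone sketch (not part of the patch) mirrors that logic with a hypothetical "--demo_" prefix and function name, purely to illustrate how the two forms are distinguished:

// Illustrative re-implementation of the ParseFlagValue() pattern; all names
// here (DemoParseFlagValue, "--demo_") are made up for the example.
#include <cstring>
#include <iostream>
#include <string>

// Returns a pointer to the value part of "--demo_<flag>=<value>", the empty
// string for a bare "--demo_<flag>" when the value is optional, or NULL when
// the argument does not match the flag at all.
const char* DemoParseFlagValue(const char* str, const char* flag,
                               bool def_optional) {
  if (str == NULL || flag == NULL) return NULL;
  const std::string flag_str = std::string("--demo_") + flag;
  if (strncmp(str, flag_str.c_str(), flag_str.length()) != 0) return NULL;
  const char* flag_end = str + flag_str.length();
  // A bare flag is acceptable only when the value part is optional.
  if (def_optional && flag_end[0] == '\0') return flag_end;
  if (flag_end[0] != '=') return NULL;
  return flag_end + 1;  // Skip the '=' and return the value.
}

int main() {
  std::cout << DemoParseFlagValue("--demo_filter=Foo.*", "filter", false)
            << "\n";  // prints "Foo.*"
  std::cout << (DemoParseFlagValue("--demo_filter", "filter", false) == NULL)
            << "\n";  // prints 1: value required but no '=' was given
  return 0;
}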
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.  The type parameter CharType can be
+// instantiated to either char or wchar_t.
+template <typename CharType>
+void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
+  for (int i = 1; i < *argc; i++) {
+    const std::string arg_string = StreamableToString(argv[i]);
+    const char* const arg = arg_string.c_str();
+
+    using internal::ParseBoolFlag;
+    using internal::ParseInt32Flag;
+    using internal::ParseStringFlag;
+
+    bool remove_flag = false;
+    if (ParseGoogleTestFlag(arg)) {
+      remove_flag = true;
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+    } else if (ParseStringFlag(arg, kFlagfileFlag, &GTEST_FLAG(flagfile))) {
+      LoadFlagsFromFile(GTEST_FLAG(flagfile));
+      remove_flag = true;
+#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
+    } else if (arg_string == "--help" || arg_string == "-h" ||
+               arg_string == "-?" || arg_string == "/?" ||
+               HasGoogleTestFlagPrefix(arg)) {
+      // Both help flag and unrecognized Google Test flags (excluding
+      // internal ones) trigger help display.
+      g_help_flag = true;
+    }
+
+    if (remove_flag) {
+      // Shift the remainder of the argv list left by one.  Note
+      // that argv has (*argc + 1) elements, the last one always being
+      // NULL.  The following loop moves the trailing NULL element as
+      // well.
+      for (int j = i; j != *argc; j++) {
+        argv[j] = argv[j + 1];
+      }
+
+      // Decrements the argument count.
+      (*argc)--;
+
+      // We also need to decrement the iterator as we just removed
+      // an element.
+      i--;
+    }
+  }
+
+  if (g_help_flag) {
+    // We print the help here instead of in RUN_ALL_TESTS(), as the
+    // latter may not be called at all if the user is using Google
+    // Test with another testing framework.
+    PrintColorEncoded(kColorEncodedHelpMessage);
+  }
+}
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
+  ParseGoogleTestFlagsOnlyImpl(argc, argv);
+}
+void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) {
+  ParseGoogleTestFlagsOnlyImpl(argc, argv);
+}
+
+// The internal implementation of InitGoogleTest().
+//
+// The type parameter CharType can be instantiated to either char or
+// wchar_t.
+template <typename CharType>
+void InitGoogleTestImpl(int* argc, CharType** argv) {
+  // We don't want to run the initialization code twice.
+  if (GTestIsInitialized()) return;
+
+  if (*argc <= 0) return;
+
+  g_argvs.clear();
+  for (int i = 0; i != *argc; i++) {
+    g_argvs.push_back(StreamableToString(argv[i]));
+  }
+
+  ParseGoogleTestFlagsOnly(argc, argv);
+  GetUnitTestImpl()->PostFlagParsingInit();
+}
+
+}  // namespace internal
+
+// Initializes Google Test.  This must be called before calling
+// RUN_ALL_TESTS().  In particular, it parses a command line for the
+// flags that Google Test recognizes.  Whenever a Google Test flag is
+// seen, it is removed from argv, and *argc is decremented.
+//
+// No value is returned.  Instead, the Google Test flag variables are
+// updated.
+//
+// Calling the function for the second time has no user-visible effect.
+void InitGoogleTest(int* argc, char** argv) {
+#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
+#else  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  internal::InitGoogleTestImpl(argc, argv);
+#endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
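As the comment above states, InitGoogleTest() must run before RUN_ALL_TESTS() so that flag parsing happens first. A minimal, conventional call site looks like the sketch below (not part of the patch; it assumes the gtest headers are on the include path, and SampleSuite/AlwaysPasses are placeholder names):

#include "gtest/gtest.h"

// Placeholder test so the binary has something to run.
TEST(SampleSuite, AlwaysPasses) {
  EXPECT_EQ(4, 2 + 2);
}

int main(int argc, char** argv) {
  // Consumes recognized --gtest_* flags, removing them from argv and
  // decrementing argc; remaining arguments are left for the caller.
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.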
+void InitGoogleTest(int* argc, wchar_t** argv) { +#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) + GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv); +#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) + internal::InitGoogleTestImpl(argc, argv); +#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) +} + +} // namespace testing diff --git a/libs/libvpx/third_party/libwebm/README.libvpx b/libs/libvpx/third_party/libwebm/README.libvpx index 73f8303222..ebb5ff2f4d 100644 --- a/libs/libvpx/third_party/libwebm/README.libvpx +++ b/libs/libvpx/third_party/libwebm/README.libvpx @@ -1,5 +1,5 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: 32d5ac49414a8914ec1e1f285f3f927c6e8ec29d +Version: 0ae757087f5e6eb01dfea16cc09205b2425cfb74 License: BSD License File: LICENSE.txt diff --git a/libs/libvpx/third_party/libwebm/common/file_util.cc b/libs/libvpx/third_party/libwebm/common/file_util.cc index 4f91318f3e..6dab146dd9 100644 --- a/libs/libvpx/third_party/libwebm/common/file_util.cc +++ b/libs/libvpx/third_party/libwebm/common/file_util.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -21,13 +22,23 @@ namespace libwebm { std::string GetTempFileName() { #if !defined _MSC_VER && !defined __MINGW32__ - char temp_file_name_template[] = "libwebm_temp.XXXXXX"; + std::string temp_file_name_template_str = + std::string(std::getenv("TEST_TMPDIR") ? std::getenv("TEST_TMPDIR") : + ".") + + "/libwebm_temp.XXXXXX"; + char* temp_file_name_template = + new char[temp_file_name_template_str.length() + 1]; + memset(temp_file_name_template, 0, temp_file_name_template_str.length() + 1); + temp_file_name_template_str.copy(temp_file_name_template, + temp_file_name_template_str.length(), 0); int fd = mkstemp(temp_file_name_template); + std::string temp_file_name = + (fd != -1) ? std::string(temp_file_name_template) : std::string(); + delete[] temp_file_name_template; if (fd != -1) { close(fd); - return std::string(temp_file_name_template); } - return std::string(); + return temp_file_name; #else char tmp_file_name[_MAX_PATH]; errno_t err = tmpnam_s(tmp_file_name); diff --git a/libs/libvpx/third_party/libwebm/common/hdr_util.cc b/libs/libvpx/third_party/libwebm/common/hdr_util.cc index e1a9842fb6..e1618ce75a 100644 --- a/libs/libvpx/third_party/libwebm/common/hdr_util.cc +++ b/libs/libvpx/third_party/libwebm/common/hdr_util.cc @@ -7,12 +7,15 @@ // be found in the AUTHORS file in the root of the source tree. 
#include "hdr_util.h" +#include #include #include #include "mkvparser/mkvparser.h" namespace libwebm { +const int Vp9CodecFeatures::kValueNotPresent = INT_MAX; + bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc, PrimaryChromaticityPtr* muxer_pc) { muxer_pc->reset(new (std::nothrow) @@ -29,9 +32,9 @@ bool MasteringMetadataValuePresent(double value) { bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm, mkvmuxer::MasteringMetadata* muxer_mm) { if (MasteringMetadataValuePresent(parser_mm.luminance_max)) - muxer_mm->luminance_max = parser_mm.luminance_max; + muxer_mm->set_luminance_max(parser_mm.luminance_max); if (MasteringMetadataValuePresent(parser_mm.luminance_min)) - muxer_mm->luminance_min = parser_mm.luminance_min; + muxer_mm->set_luminance_min(parser_mm.luminance_min); PrimaryChromaticityPtr r_ptr(NULL); PrimaryChromaticityPtr g_ptr(NULL); @@ -73,34 +76,37 @@ bool CopyColour(const mkvparser::Colour& parser_colour, return false; if (ColourValuePresent(parser_colour.matrix_coefficients)) - muxer_colour->matrix_coefficients = parser_colour.matrix_coefficients; + muxer_colour->set_matrix_coefficients(parser_colour.matrix_coefficients); if (ColourValuePresent(parser_colour.bits_per_channel)) - muxer_colour->bits_per_channel = parser_colour.bits_per_channel; - if (ColourValuePresent(parser_colour.chroma_subsampling_horz)) - muxer_colour->chroma_subsampling_horz = - parser_colour.chroma_subsampling_horz; - if (ColourValuePresent(parser_colour.chroma_subsampling_vert)) - muxer_colour->chroma_subsampling_vert = - parser_colour.chroma_subsampling_vert; + muxer_colour->set_bits_per_channel(parser_colour.bits_per_channel); + if (ColourValuePresent(parser_colour.chroma_subsampling_horz)) { + muxer_colour->set_chroma_subsampling_horz( + parser_colour.chroma_subsampling_horz); + } + if (ColourValuePresent(parser_colour.chroma_subsampling_vert)) { + muxer_colour->set_chroma_subsampling_vert( + parser_colour.chroma_subsampling_vert); + } if (ColourValuePresent(parser_colour.cb_subsampling_horz)) - muxer_colour->cb_subsampling_horz = parser_colour.cb_subsampling_horz; + muxer_colour->set_cb_subsampling_horz(parser_colour.cb_subsampling_horz); if (ColourValuePresent(parser_colour.cb_subsampling_vert)) - muxer_colour->cb_subsampling_vert = parser_colour.cb_subsampling_vert; + muxer_colour->set_cb_subsampling_vert(parser_colour.cb_subsampling_vert); if (ColourValuePresent(parser_colour.chroma_siting_horz)) - muxer_colour->chroma_siting_horz = parser_colour.chroma_siting_horz; + muxer_colour->set_chroma_siting_horz(parser_colour.chroma_siting_horz); if (ColourValuePresent(parser_colour.chroma_siting_vert)) - muxer_colour->chroma_siting_vert = parser_colour.chroma_siting_vert; + muxer_colour->set_chroma_siting_vert(parser_colour.chroma_siting_vert); if (ColourValuePresent(parser_colour.range)) - muxer_colour->range = parser_colour.range; - if (ColourValuePresent(parser_colour.transfer_characteristics)) - muxer_colour->transfer_characteristics = - parser_colour.transfer_characteristics; + muxer_colour->set_range(parser_colour.range); + if (ColourValuePresent(parser_colour.transfer_characteristics)) { + muxer_colour->set_transfer_characteristics( + parser_colour.transfer_characteristics); + } if (ColourValuePresent(parser_colour.primaries)) - muxer_colour->primaries = parser_colour.primaries; + muxer_colour->set_primaries(parser_colour.primaries); if (ColourValuePresent(parser_colour.max_cll)) - muxer_colour->max_cll = parser_colour.max_cll; + 
muxer_colour->set_max_cll(parser_colour.max_cll); if (ColourValuePresent(parser_colour.max_fall)) - muxer_colour->max_fall = parser_colour.max_fall; + muxer_colour->set_max_fall(parser_colour.max_fall); if (parser_colour.mastering_metadata) { mkvmuxer::MasteringMetadata muxer_mm; @@ -116,8 +122,8 @@ bool CopyColour(const mkvparser::Colour& parser_colour, // // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -// | ID Byte | Length | | -// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +// | ID Byte | Length | | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | // | | // : Bytes 1..Length of Codec Feature : // | | @@ -132,51 +138,83 @@ bool CopyColour(const mkvparser::Colour& parser_colour, // // The X bit is reserved. // -// Currently only profile level is supported. ID byte must be set to 1, and -// length must be 1. Supported values are: -// -// 10: Level 1 -// 11: Level 1.1 -// 20: Level 2 -// 21: Level 2.1 -// 30: Level 3 -// 31: Level 3.1 -// 40: Level 4 -// 41: Level 4.1 -// 50: Level 5 -// 51: Level 5.1 -// 52: Level 5.2 -// 60: Level 6 -// 61: Level 6.1 -// 62: Level 6.2 -// // See the following link for more information: // http://www.webmproject.org/vp9/profiles/ -int ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length) { - const int kVpxCodecPrivateLength = 3; - if (!private_data || length != kVpxCodecPrivateLength) - return 0; +bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length, + Vp9CodecFeatures* features) { + const int kVpxCodecPrivateMinLength = 3; + if (!private_data || !features || length < kVpxCodecPrivateMinLength) + return false; - const uint8_t id_byte = *private_data; - if (id_byte != 1) - return 0; + const uint8_t kVp9ProfileId = 1; + const uint8_t kVp9LevelId = 2; + const uint8_t kVp9BitDepthId = 3; + const uint8_t kVp9ChromaSubsamplingId = 4; + const int kVpxFeatureLength = 1; + int offset = 0; - const int kVpxProfileLength = 1; - const uint8_t length_byte = private_data[1]; - if (length_byte != kVpxProfileLength) - return 0; + // Set features to not set. 
+ features->profile = Vp9CodecFeatures::kValueNotPresent; + features->level = Vp9CodecFeatures::kValueNotPresent; + features->bit_depth = Vp9CodecFeatures::kValueNotPresent; + features->chroma_subsampling = Vp9CodecFeatures::kValueNotPresent; + do { + const uint8_t id_byte = private_data[offset++]; + const uint8_t length_byte = private_data[offset++]; + if (length_byte != kVpxFeatureLength) + return false; + if (id_byte == kVp9ProfileId) { + const int priv_profile = static_cast(private_data[offset++]); + if (priv_profile < 0 || priv_profile > 3) + return false; + if (features->profile != Vp9CodecFeatures::kValueNotPresent && + features->profile != priv_profile) { + return false; + } + features->profile = priv_profile; + } else if (id_byte == kVp9LevelId) { + const int priv_level = static_cast(private_data[offset++]); - const int level = static_cast(private_data[2]); + const int kNumLevels = 14; + const int levels[kNumLevels] = {10, 11, 20, 21, 30, 31, 40, + 41, 50, 51, 52, 60, 61, 62}; - const int kNumLevels = 14; - const int levels[kNumLevels] = {10, 11, 20, 21, 30, 31, 40, - 41, 50, 51, 52, 60, 61, 62}; + for (int i = 0; i < kNumLevels; ++i) { + if (priv_level == levels[i]) { + if (features->level != Vp9CodecFeatures::kValueNotPresent && + features->level != priv_level) { + return false; + } + features->level = priv_level; + break; + } + } + if (features->level == Vp9CodecFeatures::kValueNotPresent) + return false; + } else if (id_byte == kVp9BitDepthId) { + const int priv_profile = static_cast(private_data[offset++]); + if (priv_profile != 8 && priv_profile != 10 && priv_profile != 12) + return false; + if (features->bit_depth != Vp9CodecFeatures::kValueNotPresent && + features->bit_depth != priv_profile) { + return false; + } + features->bit_depth = priv_profile; + } else if (id_byte == kVp9ChromaSubsamplingId) { + const int priv_profile = static_cast(private_data[offset++]); + if (priv_profile != 0 && priv_profile != 2 && priv_profile != 3) + return false; + if (features->chroma_subsampling != Vp9CodecFeatures::kValueNotPresent && + features->chroma_subsampling != priv_profile) { + return false; + } + features->chroma_subsampling = priv_profile; + } else { + // Invalid ID. + return false; + } + } while (offset + kVpxCodecPrivateMinLength <= length); - for (int i = 0; i < kNumLevels; ++i) { - if (level == levels[i]) - return level; - } - - return 0; + return true; } } // namespace libwebm diff --git a/libs/libvpx/third_party/libwebm/common/hdr_util.h b/libs/libvpx/third_party/libwebm/common/hdr_util.h index d30c2b9f2a..3ef5388fd0 100644 --- a/libs/libvpx/third_party/libwebm/common/hdr_util.h +++ b/libs/libvpx/third_party/libwebm/common/hdr_util.h @@ -28,7 +28,34 @@ namespace libwebm { // TODO(tomfinegan): These should be moved to libwebm_utils once c++11 is // required by libwebm. +// Features of the VP9 codec that may be set in the CodecPrivate of a VP9 video +// stream. A value of kValueNotPresent represents that the value was not set in +// the CodecPrivate. 
+struct Vp9CodecFeatures { + static const int kValueNotPresent; + + Vp9CodecFeatures() + : profile(kValueNotPresent), + level(kValueNotPresent), + bit_depth(kValueNotPresent), + chroma_subsampling(kValueNotPresent) {} + ~Vp9CodecFeatures() {} + + int profile; + int level; + int bit_depth; + int chroma_subsampling; +}; + +// disable deprecation warnings for auto_ptr +#if defined(__GNUC__) && __GNUC__ >= 5 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif typedef std::auto_ptr PrimaryChromaticityPtr; +#if defined(__GNUC__) && __GNUC__ >= 5 +#pragma GCC diagnostic pop +#endif bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc, PrimaryChromaticityPtr* muxer_pc); @@ -43,8 +70,9 @@ bool ColourValuePresent(long long value); bool CopyColour(const mkvparser::Colour& parser_colour, mkvmuxer::Colour* muxer_colour); -// Returns VP9 profile upon success or 0 upon failure. -int ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length); +// Returns true if |features| is set to one or more valid values. +bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length, + Vp9CodecFeatures* features); } // namespace libwebm diff --git a/libs/libvpx/third_party/libwebm/common/webmids.h b/libs/libvpx/third_party/libwebm/common/webmids.h index 32a0c5fb91..89d722a71b 100644 --- a/libs/libvpx/third_party/libwebm/common/webmids.h +++ b/libs/libvpx/third_party/libwebm/common/webmids.h @@ -124,6 +124,14 @@ enum MkvId { kMkvLuminanceMin = 0x55DA, // end mastering metadata // end colour + // projection + kMkvProjection = 0x7670, + kMkvProjectionType = 0x7671, + kMkvProjectionPrivate = 0x7672, + kMkvProjectionPoseYaw = 0x7673, + kMkvProjectionPosePitch = 0x7674, + kMkvProjectionPoseRoll = 0x7675, + // end projection // audio kMkvAudio = 0xE1, kMkvSamplingFrequency = 0xB5, diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc index c79ce24ed3..15b9a908d8 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "common/webmids.h" @@ -23,12 +24,26 @@ #include "mkvmuxer/mkvwriter.h" #include "mkvparser/mkvparser.h" +// disable deprecation warnings for auto_ptr +#if defined(__GNUC__) && __GNUC__ >= 5 +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + namespace mkvmuxer { +const float PrimaryChromaticity::kChromaticityMin = 0.0f; +const float PrimaryChromaticity::kChromaticityMax = 1.0f; +const float MasteringMetadata::kMinLuminance = 0.0f; +const float MasteringMetadata::kMinLuminanceMax = 999.99f; +const float MasteringMetadata::kMaxLuminanceMax = 9999.99f; const float MasteringMetadata::kValueNotPresent = FLT_MAX; const uint64_t Colour::kValueNotPresent = UINT64_MAX; namespace { + +const char kDocTypeWebm[] = "webm"; +const char kDocTypeMatroska[] = "matroska"; + // Deallocate the string designated by |dst|, and then copy the |src| // string to |dst|. 
The caller owns both the |src| string and the // |dst| copy (hence the caller is responsible for eventually @@ -63,7 +78,7 @@ bool CopyChromaticity(const PrimaryChromaticity* src, if (!dst) return false; - dst->reset(new (std::nothrow) PrimaryChromaticity(src->x, src->y)); + dst->reset(new (std::nothrow) PrimaryChromaticity(src->x(), src->y())); if (!dst->get()) return false; @@ -80,36 +95,57 @@ IMkvWriter::IMkvWriter() {} IMkvWriter::~IMkvWriter() {} -bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version) { +bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version, + const char* const doc_type) { // Level 0 - uint64_t size = EbmlElementSize(libwebm::kMkvEBMLVersion, UINT64_C(1)); - size += EbmlElementSize(libwebm::kMkvEBMLReadVersion, UINT64_C(1)); - size += EbmlElementSize(libwebm::kMkvEBMLMaxIDLength, UINT64_C(4)); - size += EbmlElementSize(libwebm::kMkvEBMLMaxSizeLength, UINT64_C(8)); - size += EbmlElementSize(libwebm::kMkvDocType, "webm"); - size += EbmlElementSize(libwebm::kMkvDocTypeVersion, doc_type_version); - size += EbmlElementSize(libwebm::kMkvDocTypeReadVersion, UINT64_C(2)); + uint64_t size = + EbmlElementSize(libwebm::kMkvEBMLVersion, static_cast(1)); + size += EbmlElementSize(libwebm::kMkvEBMLReadVersion, static_cast(1)); + size += EbmlElementSize(libwebm::kMkvEBMLMaxIDLength, static_cast(4)); + size += + EbmlElementSize(libwebm::kMkvEBMLMaxSizeLength, static_cast(8)); + size += EbmlElementSize(libwebm::kMkvDocType, doc_type); + size += EbmlElementSize(libwebm::kMkvDocTypeVersion, + static_cast(doc_type_version)); + size += + EbmlElementSize(libwebm::kMkvDocTypeReadVersion, static_cast(2)); if (!WriteEbmlMasterElement(writer, libwebm::kMkvEBML, size)) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvEBMLVersion, UINT64_C(1))) + if (!WriteEbmlElement(writer, libwebm::kMkvEBMLVersion, + static_cast(1))) { return false; - if (!WriteEbmlElement(writer, libwebm::kMkvEBMLReadVersion, UINT64_C(1))) + } + if (!WriteEbmlElement(writer, libwebm::kMkvEBMLReadVersion, + static_cast(1))) { return false; - if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxIDLength, UINT64_C(4))) + } + if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxIDLength, + static_cast(4))) { return false; - if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxSizeLength, UINT64_C(8))) + } + if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxSizeLength, + static_cast(8))) { return false; - if (!WriteEbmlElement(writer, libwebm::kMkvDocType, "webm")) + } + if (!WriteEbmlElement(writer, libwebm::kMkvDocType, doc_type)) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeVersion, doc_type_version)) + if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeVersion, + static_cast(doc_type_version))) { return false; - if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeReadVersion, UINT64_C(2))) + } + if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeReadVersion, + static_cast(2))) { return false; + } return true; } +bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version) { + return WriteEbmlHeader(writer, doc_type_version, kDocTypeWebm); +} + bool WriteEbmlHeader(IMkvWriter* writer) { return WriteEbmlHeader(writer, mkvmuxer::Segment::kDefaultDocTypeVersion); } @@ -262,15 +298,17 @@ bool CuePoint::Write(IMkvWriter* writer) const { if (!writer || track_ < 1 || cluster_pos_ < 1) return false; - uint64_t size = - EbmlElementSize(libwebm::kMkvCueClusterPosition, cluster_pos_); - size += EbmlElementSize(libwebm::kMkvCueTrack, track_); + uint64_t size = 
EbmlElementSize(libwebm::kMkvCueClusterPosition, + static_cast(cluster_pos_)); + size += EbmlElementSize(libwebm::kMkvCueTrack, static_cast(track_)); if (output_block_number_ && block_number_ > 1) - size += EbmlElementSize(libwebm::kMkvCueBlockNumber, block_number_); + size += EbmlElementSize(libwebm::kMkvCueBlockNumber, + static_cast(block_number_)); const uint64_t track_pos_size = EbmlMasterElementSize(libwebm::kMkvCueTrackPositions, size) + size; const uint64_t payload_size = - EbmlElementSize(libwebm::kMkvCueTime, time_) + track_pos_size; + EbmlElementSize(libwebm::kMkvCueTime, static_cast(time_)) + + track_pos_size; if (!WriteEbmlMasterElement(writer, libwebm::kMkvCuePoint, payload_size)) return false; @@ -279,18 +317,27 @@ bool CuePoint::Write(IMkvWriter* writer) const { if (payload_position < 0) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvCueTime, time_)) + if (!WriteEbmlElement(writer, libwebm::kMkvCueTime, + static_cast(time_))) { return false; + } if (!WriteEbmlMasterElement(writer, libwebm::kMkvCueTrackPositions, size)) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvCueTrack, track_)) + if (!WriteEbmlElement(writer, libwebm::kMkvCueTrack, + static_cast(track_))) { return false; - if (!WriteEbmlElement(writer, libwebm::kMkvCueClusterPosition, cluster_pos_)) + } + if (!WriteEbmlElement(writer, libwebm::kMkvCueClusterPosition, + static_cast(cluster_pos_))) { return false; - if (output_block_number_ && block_number_ > 1) - if (!WriteEbmlElement(writer, libwebm::kMkvCueBlockNumber, block_number_)) + } + if (output_block_number_ && block_number_ > 1) { + if (!WriteEbmlElement(writer, libwebm::kMkvCueBlockNumber, + static_cast(block_number_))) { return false; + } + } const int64_t stop_position = writer->Position(); if (stop_position < 0) @@ -303,15 +350,17 @@ bool CuePoint::Write(IMkvWriter* writer) const { } uint64_t CuePoint::PayloadSize() const { - uint64_t size = - EbmlElementSize(libwebm::kMkvCueClusterPosition, cluster_pos_); - size += EbmlElementSize(libwebm::kMkvCueTrack, track_); + uint64_t size = EbmlElementSize(libwebm::kMkvCueClusterPosition, + static_cast(cluster_pos_)); + size += EbmlElementSize(libwebm::kMkvCueTrack, static_cast(track_)); if (output_block_number_ && block_number_ > 1) - size += EbmlElementSize(libwebm::kMkvCueBlockNumber, block_number_); + size += EbmlElementSize(libwebm::kMkvCueBlockNumber, + static_cast(block_number_)); const uint64_t track_pos_size = EbmlMasterElementSize(libwebm::kMkvCueTrackPositions, size) + size; const uint64_t payload_size = - EbmlElementSize(libwebm::kMkvCueTime, time_) + track_pos_size; + EbmlElementSize(libwebm::kMkvCueTime, static_cast(time_)) + + track_pos_size; return payload_size; } @@ -456,8 +505,9 @@ bool ContentEncAESSettings::Write(IMkvWriter* writer) const { return false; if (!WriteEbmlElement(writer, libwebm::kMkvAESSettingsCipherMode, - cipher_mode_)) + static_cast(cipher_mode_))) { return false; + } const int64_t stop_position = writer->Position(); if (stop_position < 0 || @@ -468,8 +518,8 @@ bool ContentEncAESSettings::Write(IMkvWriter* writer) const { } uint64_t ContentEncAESSettings::PayloadSize() const { - uint64_t size = - EbmlElementSize(libwebm::kMkvAESSettingsCipherMode, cipher_mode_); + uint64_t size = EbmlElementSize(libwebm::kMkvAESSettingsCipherMode, + static_cast(cipher_mode_)); return size; } @@ -529,20 +579,22 @@ bool ContentEncoding::Write(IMkvWriter* writer) const { encoding_size)) return false; if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingOrder, - 
encoding_order_)) + static_cast(encoding_order_))) return false; if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingScope, - encoding_scope_)) + static_cast(encoding_scope_))) return false; if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingType, - encoding_type_)) + static_cast(encoding_type_))) return false; if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncryption, encryption_size)) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvContentEncAlgo, enc_algo_)) + if (!WriteEbmlElement(writer, libwebm::kMkvContentEncAlgo, + static_cast(enc_algo_))) { return false; + } if (!WriteEbmlElement(writer, libwebm::kMkvContentEncKeyID, enc_key_id_, enc_key_id_length_)) return false; @@ -571,12 +623,12 @@ uint64_t ContentEncoding::EncodingSize(uint64_t compresion_size, EbmlMasterElementSize(libwebm::kMkvContentEncryption, encryption_size) + encryption_size; } - encoding_size += - EbmlElementSize(libwebm::kMkvContentEncodingType, encoding_type_); - encoding_size += - EbmlElementSize(libwebm::kMkvContentEncodingScope, encoding_scope_); - encoding_size += - EbmlElementSize(libwebm::kMkvContentEncodingOrder, encoding_order_); + encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingType, + static_cast(encoding_type_)); + encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingScope, + static_cast(encoding_scope_)); + encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingOrder, + static_cast(encoding_order_)); return encoding_size; } @@ -586,7 +638,8 @@ uint64_t ContentEncoding::EncryptionSize() const { uint64_t encryption_size = EbmlElementSize(libwebm::kMkvContentEncKeyID, enc_key_id_, enc_key_id_length_); - encryption_size += EbmlElementSize(libwebm::kMkvContentEncAlgo, enc_algo_); + encryption_size += EbmlElementSize(libwebm::kMkvContentEncAlgo, + static_cast(enc_algo_)); return encryption_size + aes_size; } @@ -664,9 +717,10 @@ ContentEncoding* Track::GetContentEncodingByIndex(uint32_t index) const { } uint64_t Track::PayloadSize() const { - uint64_t size = EbmlElementSize(libwebm::kMkvTrackNumber, number_); - size += EbmlElementSize(libwebm::kMkvTrackUID, uid_); - size += EbmlElementSize(libwebm::kMkvTrackType, type_); + uint64_t size = + EbmlElementSize(libwebm::kMkvTrackNumber, static_cast(number_)); + size += EbmlElementSize(libwebm::kMkvTrackUID, static_cast(uid_)); + size += EbmlElementSize(libwebm::kMkvTrackType, static_cast(type_)); if (codec_id_) size += EbmlElementSize(libwebm::kMkvCodecID, codec_id_); if (codec_private_) @@ -676,15 +730,22 @@ uint64_t Track::PayloadSize() const { size += EbmlElementSize(libwebm::kMkvLanguage, language_); if (name_) size += EbmlElementSize(libwebm::kMkvName, name_); - if (max_block_additional_id_) + if (max_block_additional_id_) { size += EbmlElementSize(libwebm::kMkvMaxBlockAdditionID, - max_block_additional_id_); - if (codec_delay_) - size += EbmlElementSize(libwebm::kMkvCodecDelay, codec_delay_); - if (seek_pre_roll_) - size += EbmlElementSize(libwebm::kMkvSeekPreRoll, seek_pre_roll_); - if (default_duration_) - size += EbmlElementSize(libwebm::kMkvDefaultDuration, default_duration_); + static_cast(max_block_additional_id_)); + } + if (codec_delay_) { + size += EbmlElementSize(libwebm::kMkvCodecDelay, + static_cast(codec_delay_)); + } + if (seek_pre_roll_) { + size += EbmlElementSize(libwebm::kMkvSeekPreRoll, + static_cast(seek_pre_roll_)); + } + if (default_duration_) { + size += EbmlElementSize(libwebm::kMkvDefaultDuration, + static_cast(default_duration_)); + } if (content_encoding_entries_size_ > 0) 
{ uint64_t content_encodings_size = 0; @@ -722,55 +783,64 @@ bool Track::Write(IMkvWriter* writer) const { if (!WriteEbmlMasterElement(writer, libwebm::kMkvTrackEntry, payload_size)) return false; - uint64_t size = EbmlElementSize(libwebm::kMkvTrackNumber, number_); - size += EbmlElementSize(libwebm::kMkvTrackUID, uid_); - size += EbmlElementSize(libwebm::kMkvTrackType, type_); + uint64_t size = + EbmlElementSize(libwebm::kMkvTrackNumber, static_cast(number_)); + size += EbmlElementSize(libwebm::kMkvTrackUID, static_cast(uid_)); + size += EbmlElementSize(libwebm::kMkvTrackType, static_cast(type_)); if (codec_id_) size += EbmlElementSize(libwebm::kMkvCodecID, codec_id_); if (codec_private_) size += EbmlElementSize(libwebm::kMkvCodecPrivate, codec_private_, - codec_private_length_); + static_cast(codec_private_length_)); if (language_) size += EbmlElementSize(libwebm::kMkvLanguage, language_); if (name_) size += EbmlElementSize(libwebm::kMkvName, name_); if (max_block_additional_id_) size += EbmlElementSize(libwebm::kMkvMaxBlockAdditionID, - max_block_additional_id_); + static_cast(max_block_additional_id_)); if (codec_delay_) - size += EbmlElementSize(libwebm::kMkvCodecDelay, codec_delay_); + size += EbmlElementSize(libwebm::kMkvCodecDelay, + static_cast(codec_delay_)); if (seek_pre_roll_) - size += EbmlElementSize(libwebm::kMkvSeekPreRoll, seek_pre_roll_); + size += EbmlElementSize(libwebm::kMkvSeekPreRoll, + static_cast(seek_pre_roll_)); if (default_duration_) - size += EbmlElementSize(libwebm::kMkvDefaultDuration, default_duration_); + size += EbmlElementSize(libwebm::kMkvDefaultDuration, + static_cast(default_duration_)); const int64_t payload_position = writer->Position(); if (payload_position < 0) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvTrackNumber, number_)) + if (!WriteEbmlElement(writer, libwebm::kMkvTrackNumber, + static_cast(number_))) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvTrackUID, uid_)) + if (!WriteEbmlElement(writer, libwebm::kMkvTrackUID, + static_cast(uid_))) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvTrackType, type_)) + if (!WriteEbmlElement(writer, libwebm::kMkvTrackType, + static_cast(type_))) return false; if (max_block_additional_id_) { if (!WriteEbmlElement(writer, libwebm::kMkvMaxBlockAdditionID, - max_block_additional_id_)) { + static_cast(max_block_additional_id_))) { return false; } } if (codec_delay_) { - if (!WriteEbmlElement(writer, libwebm::kMkvCodecDelay, codec_delay_)) + if (!WriteEbmlElement(writer, libwebm::kMkvCodecDelay, + static_cast(codec_delay_))) return false; } if (seek_pre_roll_) { - if (!WriteEbmlElement(writer, libwebm::kMkvSeekPreRoll, seek_pre_roll_)) + if (!WriteEbmlElement(writer, libwebm::kMkvSeekPreRoll, + static_cast(seek_pre_roll_))) return false; } if (default_duration_) { if (!WriteEbmlElement(writer, libwebm::kMkvDefaultDuration, - default_duration_)) + static_cast(default_duration_))) return false; } if (codec_id_) { @@ -779,7 +849,7 @@ bool Track::Write(IMkvWriter* writer) const { } if (codec_private_) { if (!WriteEbmlElement(writer, libwebm::kMkvCodecPrivate, codec_private_, - codec_private_length_)) + static_cast(codec_private_length_))) return false; } if (language_) { @@ -890,14 +960,23 @@ void Track::set_name(const char* name) { // // Colour and its child elements -uint64_t PrimaryChromaticity::PrimaryChromaticityPayloadSize( +uint64_t PrimaryChromaticity::PrimaryChromaticitySize( libwebm::MkvId x_id, libwebm::MkvId y_id) const { - return EbmlElementSize(x_id, x) + 
EbmlElementSize(y_id, y); + return EbmlElementSize(x_id, x_) + EbmlElementSize(y_id, y_); } bool PrimaryChromaticity::Write(IMkvWriter* writer, libwebm::MkvId x_id, libwebm::MkvId y_id) const { - return WriteEbmlElement(writer, x_id, x) && WriteEbmlElement(writer, y_id, y); + if (!Valid()) { + return false; + } + return WriteEbmlElement(writer, x_id, x_) && + WriteEbmlElement(writer, y_id, y_); +} + +bool PrimaryChromaticity::Valid() const { + return (x_ >= kChromaticityMin && x_ <= kChromaticityMax && + y_ >= kChromaticityMin && y_ <= kChromaticityMax); } uint64_t MasteringMetadata::MasteringMetadataSize() const { @@ -909,6 +988,31 @@ uint64_t MasteringMetadata::MasteringMetadataSize() const { return size; } +bool MasteringMetadata::Valid() const { + if (luminance_min_ != kValueNotPresent) { + if (luminance_min_ < kMinLuminance || luminance_min_ > kMinLuminanceMax || + luminance_min_ > luminance_max_) { + return false; + } + } + if (luminance_max_ != kValueNotPresent) { + if (luminance_max_ < kMinLuminance || luminance_max_ > kMaxLuminanceMax || + luminance_max_ < luminance_min_) { + return false; + } + } + if (r_ && !r_->Valid()) + return false; + if (g_ && !g_->Valid()) + return false; + if (b_ && !b_->Valid()) + return false; + if (white_point_ && !white_point_->Valid()) + return false; + + return true; +} + bool MasteringMetadata::Write(IMkvWriter* writer) const { const uint64_t size = PayloadSize(); @@ -918,12 +1022,12 @@ bool MasteringMetadata::Write(IMkvWriter* writer) const { if (!WriteEbmlMasterElement(writer, libwebm::kMkvMasteringMetadata, size)) return false; - if (luminance_max != kValueNotPresent && - !WriteEbmlElement(writer, libwebm::kMkvLuminanceMax, luminance_max)) { + if (luminance_max_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvLuminanceMax, luminance_max_)) { return false; } - if (luminance_min != kValueNotPresent && - !WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min)) { + if (luminance_min_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min_)) { return false; } if (r_ && @@ -984,25 +1088,25 @@ bool MasteringMetadata::SetChromaticity( uint64_t MasteringMetadata::PayloadSize() const { uint64_t size = 0; - if (luminance_max != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvLuminanceMax, luminance_max); - if (luminance_min != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvLuminanceMin, luminance_min); + if (luminance_max_ != kValueNotPresent) + size += EbmlElementSize(libwebm::kMkvLuminanceMax, luminance_max_); + if (luminance_min_ != kValueNotPresent) + size += EbmlElementSize(libwebm::kMkvLuminanceMin, luminance_min_); if (r_) { - size += r_->PrimaryChromaticityPayloadSize( - libwebm::kMkvPrimaryRChromaticityX, libwebm::kMkvPrimaryRChromaticityY); + size += r_->PrimaryChromaticitySize(libwebm::kMkvPrimaryRChromaticityX, + libwebm::kMkvPrimaryRChromaticityY); } if (g_) { - size += g_->PrimaryChromaticityPayloadSize( - libwebm::kMkvPrimaryGChromaticityX, libwebm::kMkvPrimaryGChromaticityY); + size += g_->PrimaryChromaticitySize(libwebm::kMkvPrimaryGChromaticityX, + libwebm::kMkvPrimaryGChromaticityY); } if (b_) { - size += b_->PrimaryChromaticityPayloadSize( - libwebm::kMkvPrimaryBChromaticityX, libwebm::kMkvPrimaryBChromaticityY); + size += b_->PrimaryChromaticitySize(libwebm::kMkvPrimaryBChromaticityX, + libwebm::kMkvPrimaryBChromaticityY); } if (white_point_) { - size += white_point_->PrimaryChromaticityPayloadSize( + size += white_point_->PrimaryChromaticitySize( 
libwebm::kMkvWhitePointChromaticityX, libwebm::kMkvWhitePointChromaticityY); } @@ -1019,6 +1123,33 @@ uint64_t Colour::ColourSize() const { return size; } +bool Colour::Valid() const { + if (mastering_metadata_ && !mastering_metadata_->Valid()) + return false; + if (matrix_coefficients_ != kValueNotPresent && + !IsMatrixCoefficientsValueValid(matrix_coefficients_)) { + return false; + } + if (chroma_siting_horz_ != kValueNotPresent && + !IsChromaSitingHorzValueValid(chroma_siting_horz_)) { + return false; + } + if (chroma_siting_vert_ != kValueNotPresent && + !IsChromaSitingVertValueValid(chroma_siting_vert_)) { + return false; + } + if (range_ != kValueNotPresent && !IsColourRangeValueValid(range_)) + return false; + if (transfer_characteristics_ != kValueNotPresent && + !IsTransferCharacteristicsValueValid(transfer_characteristics_)) { + return false; + } + if (primaries_ != kValueNotPresent && !IsPrimariesValueValid(primaries_)) + return false; + + return true; +} + bool Colour::Write(IMkvWriter* writer) const { const uint64_t size = PayloadSize(); @@ -1026,69 +1157,77 @@ bool Colour::Write(IMkvWriter* writer) const { if (size == 0) return true; + // Don't write an invalid element. + if (!Valid()) + return false; + if (!WriteEbmlMasterElement(writer, libwebm::kMkvColour, size)) return false; - if (matrix_coefficients != kValueNotPresent && + if (matrix_coefficients_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvMatrixCoefficients, - matrix_coefficients)) { + static_cast(matrix_coefficients_))) { return false; } - if (bits_per_channel != kValueNotPresent && + if (bits_per_channel_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvBitsPerChannel, - bits_per_channel)) { + static_cast(bits_per_channel_))) { return false; } - if (chroma_subsampling_horz != kValueNotPresent && + if (chroma_subsampling_horz_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvChromaSubsamplingHorz, - chroma_subsampling_horz)) { + static_cast(chroma_subsampling_horz_))) { return false; } - if (chroma_subsampling_vert != kValueNotPresent && + if (chroma_subsampling_vert_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvChromaSubsamplingVert, - chroma_subsampling_vert)) { + static_cast(chroma_subsampling_vert_))) { return false; } - if (cb_subsampling_horz != kValueNotPresent && + if (cb_subsampling_horz_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvCbSubsamplingHorz, - cb_subsampling_horz)) { + static_cast(cb_subsampling_horz_))) { return false; } - if (cb_subsampling_vert != kValueNotPresent && + if (cb_subsampling_vert_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvCbSubsamplingVert, - cb_subsampling_vert)) { + static_cast(cb_subsampling_vert_))) { return false; } - if (chroma_siting_horz != kValueNotPresent && + if (chroma_siting_horz_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvChromaSitingHorz, - chroma_siting_horz)) { + static_cast(chroma_siting_horz_))) { return false; } - if (chroma_siting_vert != kValueNotPresent && + if (chroma_siting_vert_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvChromaSitingVert, - chroma_siting_vert)) { + static_cast(chroma_siting_vert_))) { return false; } - if (range != kValueNotPresent && - !WriteEbmlElement(writer, libwebm::kMkvRange, range)) { + if (range_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvRange, + static_cast(range_))) { return false; } - if (transfer_characteristics != kValueNotPresent && + if (transfer_characteristics_ != 
kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvTransferCharacteristics, - transfer_characteristics)) { + static_cast(transfer_characteristics_))) { return false; } - if (primaries != kValueNotPresent && - !WriteEbmlElement(writer, libwebm::kMkvPrimaries, primaries)) { + if (primaries_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvPrimaries, + static_cast(primaries_))) { return false; } - if (max_cll != kValueNotPresent && - !WriteEbmlElement(writer, libwebm::kMkvMaxCLL, max_cll)) { + if (max_cll_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvMaxCLL, + static_cast(max_cll_))) { return false; } - if (max_fall != kValueNotPresent && - !WriteEbmlElement(writer, libwebm::kMkvMaxFALL, max_fall)) { + if (max_fall_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvMaxFALL, + static_cast(max_fall_))) { return false; } @@ -1103,8 +1242,8 @@ bool Colour::SetMasteringMetadata(const MasteringMetadata& mastering_metadata) { if (!mm_ptr.get()) return false; - mm_ptr->luminance_max = mastering_metadata.luminance_max; - mm_ptr->luminance_min = mastering_metadata.luminance_min; + mm_ptr->set_luminance_max(mastering_metadata.luminance_max()); + mm_ptr->set_luminance_min(mastering_metadata.luminance_min()); if (!mm_ptr->SetChromaticity(mastering_metadata.r(), mastering_metadata.g(), mastering_metadata.b(), @@ -1120,38 +1259,56 @@ bool Colour::SetMasteringMetadata(const MasteringMetadata& mastering_metadata) { uint64_t Colour::PayloadSize() const { uint64_t size = 0; - if (matrix_coefficients != kValueNotPresent) - size += - EbmlElementSize(libwebm::kMkvMatrixCoefficients, matrix_coefficients); - if (bits_per_channel != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvBitsPerChannel, bits_per_channel); - if (chroma_subsampling_horz != kValueNotPresent) + if (matrix_coefficients_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvMatrixCoefficients, + static_cast(matrix_coefficients_)); + } + if (bits_per_channel_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvBitsPerChannel, + static_cast(bits_per_channel_)); + } + if (chroma_subsampling_horz_ != kValueNotPresent) { size += EbmlElementSize(libwebm::kMkvChromaSubsamplingHorz, - chroma_subsampling_horz); - if (chroma_subsampling_vert != kValueNotPresent) + static_cast(chroma_subsampling_horz_)); + } + if (chroma_subsampling_vert_ != kValueNotPresent) { size += EbmlElementSize(libwebm::kMkvChromaSubsamplingVert, - chroma_subsampling_vert); - if (cb_subsampling_horz != kValueNotPresent) - size += - EbmlElementSize(libwebm::kMkvCbSubsamplingHorz, cb_subsampling_horz); - if (cb_subsampling_vert != kValueNotPresent) - size += - EbmlElementSize(libwebm::kMkvCbSubsamplingVert, cb_subsampling_vert); - if (chroma_siting_horz != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvChromaSitingHorz, chroma_siting_horz); - if (chroma_siting_vert != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvChromaSitingVert, chroma_siting_vert); - if (range != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvRange, range); - if (transfer_characteristics != kValueNotPresent) + static_cast(chroma_subsampling_vert_)); + } + if (cb_subsampling_horz_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvCbSubsamplingHorz, + static_cast(cb_subsampling_horz_)); + } + if (cb_subsampling_vert_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvCbSubsamplingVert, + static_cast(cb_subsampling_vert_)); + } + if (chroma_siting_horz_ != kValueNotPresent) { + size += 
EbmlElementSize(libwebm::kMkvChromaSitingHorz, + static_cast(chroma_siting_horz_)); + } + if (chroma_siting_vert_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvChromaSitingVert, + static_cast(chroma_siting_vert_)); + } + if (range_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvRange, static_cast(range_)); + } + if (transfer_characteristics_ != kValueNotPresent) { size += EbmlElementSize(libwebm::kMkvTransferCharacteristics, - transfer_characteristics); - if (primaries != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvPrimaries, primaries); - if (max_cll != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvMaxCLL, max_cll); - if (max_fall != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvMaxFALL, max_fall); + static_cast(transfer_characteristics_)); + } + if (primaries_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvPrimaries, + static_cast(primaries_)); + } + if (max_cll_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvMaxCLL, static_cast(max_cll_)); + } + if (max_fall_ != kValueNotPresent) { + size += + EbmlElementSize(libwebm::kMkvMaxFALL, static_cast(max_fall_)); + } if (mastering_metadata_) size += mastering_metadata_->MasteringMetadataSize(); @@ -1159,6 +1316,95 @@ uint64_t Colour::PayloadSize() const { return size; } +/////////////////////////////////////////////////////////////// +// +// Projection element + +uint64_t Projection::ProjectionSize() const { + uint64_t size = PayloadSize(); + + if (size > 0) + size += EbmlMasterElementSize(libwebm::kMkvProjection, size); + + return size; +} + +bool Projection::Write(IMkvWriter* writer) const { + const uint64_t size = PayloadSize(); + + // Don't write an empty element. + if (size == 0) + return true; + + if (!WriteEbmlMasterElement(writer, libwebm::kMkvProjection, size)) + return false; + + if (!WriteEbmlElement(writer, libwebm::kMkvProjectionType, + static_cast(type_))) { + return false; + } + + if (private_data_length_ > 0 && private_data_ != NULL && + !WriteEbmlElement(writer, libwebm::kMkvProjectionPrivate, private_data_, + private_data_length_)) { + return false; + } + + if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPoseYaw, pose_yaw_)) + return false; + + if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPosePitch, + pose_pitch_)) { + return false; + } + + if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPoseRoll, pose_roll_)) { + return false; + } + + return true; +} + +bool Projection::SetProjectionPrivate(const uint8_t* data, + uint64_t data_length) { + if (data == NULL || data_length == 0) { + return false; + } + + if (data_length != static_cast(data_length)) { + return false; + } + + uint8_t* new_private_data = + new (std::nothrow) uint8_t[static_cast(data_length)]; + if (new_private_data == NULL) { + return false; + } + + delete[] private_data_; + private_data_ = new_private_data; + private_data_length_ = data_length; + memcpy(private_data_, data, static_cast(data_length)); + + return true; +} + +uint64_t Projection::PayloadSize() const { + uint64_t size = + EbmlElementSize(libwebm::kMkvProjection, static_cast(type_)); + + if (private_data_length_ > 0 && private_data_ != NULL) { + size += EbmlElementSize(libwebm::kMkvProjectionPrivate, private_data_, + private_data_length_); + } + + size += EbmlElementSize(libwebm::kMkvProjectionPoseYaw, pose_yaw_); + size += EbmlElementSize(libwebm::kMkvProjectionPosePitch, pose_pitch_); + size += EbmlElementSize(libwebm::kMkvProjectionPoseRoll, pose_roll_); + + return size; +} + 
/////////////////////////////////////////////////////////////// // // VideoTrack Class @@ -1167,6 +1413,8 @@ VideoTrack::VideoTrack(unsigned int* seed) : Track(seed), display_height_(0), display_width_(0), + pixel_height_(0), + pixel_width_(0), crop_left_(0), crop_right_(0), crop_top_(0), @@ -1176,9 +1424,13 @@ VideoTrack::VideoTrack(unsigned int* seed) stereo_mode_(0), alpha_mode_(0), width_(0), - colour_(NULL) {} + colour_(NULL), + projection_(NULL) {} -VideoTrack::~VideoTrack() { delete colour_; } +VideoTrack::~VideoTrack() { + delete colour_; + delete projection_; +} bool VideoTrack::SetStereoMode(uint64_t stereo_mode) { if (stereo_mode != kMono && stereo_mode != kSideBySideLeftIsFirst && @@ -1221,40 +1473,52 @@ bool VideoTrack::Write(IMkvWriter* writer) const { if (payload_position < 0) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvPixelWidth, width_)) + if (!WriteEbmlElement( + writer, libwebm::kMkvPixelWidth, + static_cast((pixel_width_ > 0) ? pixel_width_ : width_))) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvPixelHeight, height_)) + if (!WriteEbmlElement( + writer, libwebm::kMkvPixelHeight, + static_cast((pixel_height_ > 0) ? pixel_height_ : height_))) return false; if (display_width_ > 0) { - if (!WriteEbmlElement(writer, libwebm::kMkvDisplayWidth, display_width_)) + if (!WriteEbmlElement(writer, libwebm::kMkvDisplayWidth, + static_cast(display_width_))) return false; } if (display_height_ > 0) { - if (!WriteEbmlElement(writer, libwebm::kMkvDisplayHeight, display_height_)) + if (!WriteEbmlElement(writer, libwebm::kMkvDisplayHeight, + static_cast(display_height_))) return false; } if (crop_left_ > 0) { - if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropLeft, crop_left_)) + if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropLeft, + static_cast(crop_left_))) return false; } if (crop_right_ > 0) { - if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropRight, crop_right_)) + if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropRight, + static_cast(crop_right_))) return false; } if (crop_top_ > 0) { - if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropTop, crop_top_)) + if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropTop, + static_cast(crop_top_))) return false; } if (crop_bottom_ > 0) { - if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropBottom, crop_bottom_)) + if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropBottom, + static_cast(crop_bottom_))) return false; } if (stereo_mode_ > kMono) { - if (!WriteEbmlElement(writer, libwebm::kMkvStereoMode, stereo_mode_)) + if (!WriteEbmlElement(writer, libwebm::kMkvStereoMode, + static_cast(stereo_mode_))) return false; } if (alpha_mode_ > kNoAlpha) { - if (!WriteEbmlElement(writer, libwebm::kMkvAlphaMode, alpha_mode_)) + if (!WriteEbmlElement(writer, libwebm::kMkvAlphaMode, + static_cast(alpha_mode_))) return false; } if (frame_rate_ > 0.0) { @@ -1267,6 +1531,10 @@ bool VideoTrack::Write(IMkvWriter* writer) const { if (!colour_->Write(writer)) return false; } + if (projection_) { + if (!projection_->Write(writer)) + return false; + } const int64_t stop_position = writer->Position(); if (stop_position < 0 || @@ -1287,47 +1555,83 @@ bool VideoTrack::SetColour(const Colour& colour) { return false; } - colour_ptr->matrix_coefficients = colour.matrix_coefficients; - colour_ptr->bits_per_channel = colour.bits_per_channel; - colour_ptr->chroma_subsampling_horz = colour.chroma_subsampling_horz; - colour_ptr->chroma_subsampling_vert = colour.chroma_subsampling_vert; - colour_ptr->cb_subsampling_horz = 
colour.cb_subsampling_horz; - colour_ptr->cb_subsampling_vert = colour.cb_subsampling_vert; - colour_ptr->chroma_siting_horz = colour.chroma_siting_horz; - colour_ptr->chroma_siting_vert = colour.chroma_siting_vert; - colour_ptr->range = colour.range; - colour_ptr->transfer_characteristics = colour.transfer_characteristics; - colour_ptr->primaries = colour.primaries; - colour_ptr->max_cll = colour.max_cll; - colour_ptr->max_fall = colour.max_fall; + colour_ptr->set_matrix_coefficients(colour.matrix_coefficients()); + colour_ptr->set_bits_per_channel(colour.bits_per_channel()); + colour_ptr->set_chroma_subsampling_horz(colour.chroma_subsampling_horz()); + colour_ptr->set_chroma_subsampling_vert(colour.chroma_subsampling_vert()); + colour_ptr->set_cb_subsampling_horz(colour.cb_subsampling_horz()); + colour_ptr->set_cb_subsampling_vert(colour.cb_subsampling_vert()); + colour_ptr->set_chroma_siting_horz(colour.chroma_siting_horz()); + colour_ptr->set_chroma_siting_vert(colour.chroma_siting_vert()); + colour_ptr->set_range(colour.range()); + colour_ptr->set_transfer_characteristics(colour.transfer_characteristics()); + colour_ptr->set_primaries(colour.primaries()); + colour_ptr->set_max_cll(colour.max_cll()); + colour_ptr->set_max_fall(colour.max_fall()); + delete colour_; colour_ = colour_ptr.release(); return true; } +bool VideoTrack::SetProjection(const Projection& projection) { + std::auto_ptr projection_ptr(new Projection()); + if (!projection_ptr.get()) + return false; + + if (projection.private_data()) { + if (!projection_ptr->SetProjectionPrivate( + projection.private_data(), projection.private_data_length())) { + return false; + } + } + + projection_ptr->set_type(projection.type()); + projection_ptr->set_pose_yaw(projection.pose_yaw()); + projection_ptr->set_pose_pitch(projection.pose_pitch()); + projection_ptr->set_pose_roll(projection.pose_roll()); + delete projection_; + projection_ = projection_ptr.release(); + return true; +} + uint64_t VideoTrack::VideoPayloadSize() const { - uint64_t size = EbmlElementSize(libwebm::kMkvPixelWidth, width_); - size += EbmlElementSize(libwebm::kMkvPixelHeight, height_); + uint64_t size = EbmlElementSize( + libwebm::kMkvPixelWidth, + static_cast((pixel_width_ > 0) ? pixel_width_ : width_)); + size += EbmlElementSize( + libwebm::kMkvPixelHeight, + static_cast((pixel_height_ > 0) ? 
pixel_height_ : height_)); if (display_width_ > 0) - size += EbmlElementSize(libwebm::kMkvDisplayWidth, display_width_); + size += EbmlElementSize(libwebm::kMkvDisplayWidth, + static_cast(display_width_)); if (display_height_ > 0) - size += EbmlElementSize(libwebm::kMkvDisplayHeight, display_height_); + size += EbmlElementSize(libwebm::kMkvDisplayHeight, + static_cast(display_height_)); if (crop_left_ > 0) - size += EbmlElementSize(libwebm::kMkvPixelCropLeft, crop_left_); + size += EbmlElementSize(libwebm::kMkvPixelCropLeft, + static_cast(crop_left_)); if (crop_right_ > 0) - size += EbmlElementSize(libwebm::kMkvPixelCropRight, crop_right_); + size += EbmlElementSize(libwebm::kMkvPixelCropRight, + static_cast(crop_right_)); if (crop_top_ > 0) - size += EbmlElementSize(libwebm::kMkvPixelCropTop, crop_top_); + size += EbmlElementSize(libwebm::kMkvPixelCropTop, + static_cast(crop_top_)); if (crop_bottom_ > 0) - size += EbmlElementSize(libwebm::kMkvPixelCropBottom, crop_bottom_); + size += EbmlElementSize(libwebm::kMkvPixelCropBottom, + static_cast(crop_bottom_)); if (stereo_mode_ > kMono) - size += EbmlElementSize(libwebm::kMkvStereoMode, stereo_mode_); + size += EbmlElementSize(libwebm::kMkvStereoMode, + static_cast(stereo_mode_)); if (alpha_mode_ > kNoAlpha) - size += EbmlElementSize(libwebm::kMkvAlphaMode, alpha_mode_); + size += EbmlElementSize(libwebm::kMkvAlphaMode, + static_cast(alpha_mode_)); if (frame_rate_ > 0.0) size += EbmlElementSize(libwebm::kMkvFrameRate, static_cast(frame_rate_)); if (colour_) size += colour_->ColourSize(); + if (projection_) + size += projection_->ProjectionSize(); return size; } @@ -1346,9 +1650,11 @@ uint64_t AudioTrack::PayloadSize() const { uint64_t size = EbmlElementSize(libwebm::kMkvSamplingFrequency, static_cast(sample_rate_)); - size += EbmlElementSize(libwebm::kMkvChannels, channels_); + size += + EbmlElementSize(libwebm::kMkvChannels, static_cast(channels_)); if (bit_depth_ > 0) - size += EbmlElementSize(libwebm::kMkvBitDepth, bit_depth_); + size += + EbmlElementSize(libwebm::kMkvBitDepth, static_cast(bit_depth_)); size += EbmlMasterElementSize(libwebm::kMkvAudio, size); return parent_size + size; @@ -1361,9 +1667,11 @@ bool AudioTrack::Write(IMkvWriter* writer) const { // Calculate AudioSettings size. 
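Both the writer and the size calculation now prefer the explicit pixel dimensions and fall back to the width/height given at track creation. A hedged usage sketch of the new setters, with illustrative anamorphic numbers (720x576 coded frames intended for a 16:9 display):

```cpp
#include "mkvmuxer/mkvmuxer.h"

// Hypothetical example: the new set_pixel_width()/set_pixel_height() override
// the width/height passed to Segment::AddVideoTrack(), while the display_*
// fields keep their existing role of describing the intended display size.
void ConfigureAnamorphicVideo(mkvmuxer::VideoTrack* track) {
  track->set_pixel_width(720);
  track->set_pixel_height(576);
  track->set_display_width(1024);
  track->set_display_height(576);
}
```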
uint64_t size = EbmlElementSize(libwebm::kMkvSamplingFrequency, static_cast(sample_rate_)); - size += EbmlElementSize(libwebm::kMkvChannels, channels_); + size += + EbmlElementSize(libwebm::kMkvChannels, static_cast(channels_)); if (bit_depth_ > 0) - size += EbmlElementSize(libwebm::kMkvBitDepth, bit_depth_); + size += + EbmlElementSize(libwebm::kMkvBitDepth, static_cast(bit_depth_)); if (!WriteEbmlMasterElement(writer, libwebm::kMkvAudio, size)) return false; @@ -1375,10 +1683,12 @@ bool AudioTrack::Write(IMkvWriter* writer) const { if (!WriteEbmlElement(writer, libwebm::kMkvSamplingFrequency, static_cast(sample_rate_))) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvChannels, channels_)) + if (!WriteEbmlElement(writer, libwebm::kMkvChannels, + static_cast(channels_))) return false; if (bit_depth_ > 0) - if (!WriteEbmlElement(writer, libwebm::kMkvBitDepth, bit_depth_)) + if (!WriteEbmlElement(writer, libwebm::kMkvBitDepth, + static_cast(bit_depth_))) return false; const int64_t stop_position = writer->Position(); @@ -1398,6 +1708,10 @@ const char Tracks::kVorbisCodecId[] = "A_VORBIS"; const char Tracks::kVp8CodecId[] = "V_VP8"; const char Tracks::kVp9CodecId[] = "V_VP9"; const char Tracks::kVp10CodecId[] = "V_VP10"; +const char Tracks::kWebVttCaptionsId[] = "D_WEBVTT/CAPTIONS"; +const char Tracks::kWebVttDescriptionsId[] = "D_WEBVTT/DESCRIPTIONS"; +const char Tracks::kWebVttMetadataId[] = "D_WEBVTT/METADATA"; +const char Tracks::kWebVttSubtitlesId[] = "D_WEBVTT/SUBTITLES"; Tracks::Tracks() : track_entries_(NULL), track_entries_size_(0), wrote_tracks_(false) {} @@ -1650,9 +1964,11 @@ bool Chapter::ExpandDisplaysArray() { uint64_t Chapter::WriteAtom(IMkvWriter* writer) const { uint64_t payload_size = EbmlElementSize(libwebm::kMkvChapterStringUID, id_) + - EbmlElementSize(libwebm::kMkvChapterUID, uid_) + - EbmlElementSize(libwebm::kMkvChapterTimeStart, start_timecode_) + - EbmlElementSize(libwebm::kMkvChapterTimeEnd, end_timecode_); + EbmlElementSize(libwebm::kMkvChapterUID, static_cast(uid_)) + + EbmlElementSize(libwebm::kMkvChapterTimeStart, + static_cast(start_timecode_)) + + EbmlElementSize(libwebm::kMkvChapterTimeEnd, + static_cast(end_timecode_)); for (int idx = 0; idx < displays_count_; ++idx) { const Display& d = displays_[idx]; @@ -1674,13 +1990,16 @@ uint64_t Chapter::WriteAtom(IMkvWriter* writer) const { if (!WriteEbmlElement(writer, libwebm::kMkvChapterStringUID, id_)) return 0; - if (!WriteEbmlElement(writer, libwebm::kMkvChapterUID, uid_)) + if (!WriteEbmlElement(writer, libwebm::kMkvChapterUID, + static_cast(uid_))) return 0; - if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeStart, start_timecode_)) + if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeStart, + static_cast(start_timecode_))) return 0; - if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeEnd, end_timecode_)) + if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeEnd, + static_cast(end_timecode_))) return 0; for (int idx = 0; idx < displays_count_; ++idx) { @@ -2125,7 +2444,17 @@ Cluster::Cluster(uint64_t timecode, int64_t cues_pos, uint64_t timecode_scale, write_last_frame_with_duration_(write_last_frame_with_duration), writer_(NULL) {} -Cluster::~Cluster() {} +Cluster::~Cluster() { + // Delete any stored frames that are left behind. This will happen if the + // Cluster was not Finalized for whatever reason. 
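The destructor body that follows frees any frames still queued per track when a Cluster is destroyed without being finalized. A standalone sketch of the same ownership rule, using a hypothetical stand-in Frame type rather than mkvmuxer::Frame:

```cpp
#include <cstdint>
#include <list>
#include <map>

struct Frame {};  // stand-in for mkvmuxer::Frame, for illustration only

// Free every frame still queued per track and empty the container, so that
// destroying the owner without finalizing it does not leak the frames.
void FreeQueuedFrames(std::map<uint64_t, std::list<Frame*> >* stored_frames) {
  for (std::map<uint64_t, std::list<Frame*> >::iterator it =
           stored_frames->begin();
       it != stored_frames->end(); ++it) {
    while (!it->second.empty()) {
      delete it->second.front();
      it->second.pop_front();
    }
  }
  stored_frames->clear();
}
```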
+ while (!stored_frames_.empty()) { + while (!stored_frames_.begin()->second.empty()) { + delete stored_frames_.begin()->second.front(); + stored_frames_.begin()->second.pop_front(); + } + stored_frames_.erase(stored_frames_.begin()->first); + } +} bool Cluster::Init(IMkvWriter* ptr_writer) { if (!ptr_writer) { @@ -2421,10 +2750,10 @@ bool SeekHead::Finalize(IMkvWriter* writer) const { for (int32_t i = 0; i < kSeekEntryCount; ++i) { if (seek_entry_id_[i] != 0) { - entry_size[i] = EbmlElementSize( - libwebm::kMkvSeekID, static_cast(seek_entry_id_[i])); - entry_size[i] += - EbmlElementSize(libwebm::kMkvSeekPosition, seek_entry_pos_[i]); + entry_size[i] = EbmlElementSize(libwebm::kMkvSeekID, + static_cast(seek_entry_id_[i])); + entry_size[i] += EbmlElementSize( + libwebm::kMkvSeekPosition, static_cast(seek_entry_pos_[i])); payload_size += EbmlMasterElementSize(libwebm::kMkvSeek, entry_size[i]) + @@ -2449,11 +2778,11 @@ bool SeekHead::Finalize(IMkvWriter* writer) const { return false; if (!WriteEbmlElement(writer, libwebm::kMkvSeekID, - static_cast(seek_entry_id_[i]))) + static_cast(seek_entry_id_[i]))) return false; if (!WriteEbmlElement(writer, libwebm::kMkvSeekPosition, - seek_entry_pos_[i])) + static_cast(seek_entry_pos_[i]))) return false; } } @@ -2522,8 +2851,10 @@ bool SeekHead::SetSeekEntry(int index, uint32_t id, uint64_t position) { uint64_t SeekHead::MaxEntrySize() const { const uint64_t max_entry_payload_size = - EbmlElementSize(libwebm::kMkvSeekID, UINT64_C(0xffffffff)) + - EbmlElementSize(libwebm::kMkvSeekPosition, UINT64_C(0xffffffffffffffff)); + EbmlElementSize(libwebm::kMkvSeekID, + static_cast(UINT64_C(0xffffffff))) + + EbmlElementSize(libwebm::kMkvSeekPosition, + static_cast(UINT64_C(0xffffffffffffffff))); const uint64_t max_entry_size = EbmlMasterElementSize(libwebm::kMkvSeek, max_entry_payload_size) + max_entry_payload_size; @@ -2613,7 +2944,8 @@ bool SegmentInfo::Write(IMkvWriter* writer) { if (!writer || !muxing_app_ || !writing_app_) return false; - uint64_t size = EbmlElementSize(libwebm::kMkvTimecodeScale, timecode_scale_); + uint64_t size = EbmlElementSize(libwebm::kMkvTimecodeScale, + static_cast(timecode_scale_)); if (duration_ > 0.0) size += EbmlElementSize(libwebm::kMkvDuration, static_cast(duration_)); @@ -2629,7 +2961,8 @@ bool SegmentInfo::Write(IMkvWriter* writer) { if (payload_position < 0) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvTimecodeScale, timecode_scale_)) + if (!WriteEbmlElement(writer, libwebm::kMkvTimecodeScale, + static_cast(timecode_scale_))) return false; if (duration_ > 0.0) { @@ -2725,10 +3058,12 @@ Segment::Segment() output_cues_(true), accurate_cluster_duration_(false), fixed_size_cluster_timecode_(false), + estimate_file_duration_(false), payload_pos_(0), size_position_(0), doc_type_version_(kDefaultDocTypeVersion), doc_type_version_written_(0), + duration_(0.0), writer_cluster_(NULL), writer_cues_(NULL), writer_header_(NULL) { @@ -2833,6 +3168,10 @@ bool Segment::Init(IMkvWriter* ptr_writer) { writer_cluster_ = ptr_writer; writer_cues_ = ptr_writer; writer_header_ = ptr_writer; + memset(&track_frames_written_, 0, + sizeof(track_frames_written_[0]) * kMaxTrackNumber); + memset(&last_track_timestamp_, 0, + sizeof(last_track_timestamp_[0]) * kMaxTrackNumber); return segment_info_.Init(); } @@ -2876,7 +3215,10 @@ bool Segment::Finalize() { if (WriteFramesAll() < 0) return false; - if (cluster_list_size_ > 0) { + // In kLive mode, call Cluster::Finalize only if |accurate_cluster_duration_| + // is set. 
In all other modes, always call Cluster::Finalize. + if ((mode_ == kLive ? accurate_cluster_duration_ : true) && + cluster_list_size_ > 0) { // Update last cluster's size Cluster* const old_cluster = cluster_list_[cluster_list_size_ - 1]; @@ -2892,9 +3234,30 @@ bool Segment::Finalize() { chunk_count_++; } - const double duration = + double duration = (static_cast(last_timestamp_) + last_block_duration_) / segment_info_.timecode_scale(); + if (duration_ > 0.0) { + duration = duration_; + } else { + if (last_block_duration_ == 0 && estimate_file_duration_) { + const int num_tracks = static_cast(tracks_.track_entries_size()); + for (int i = 0; i < num_tracks; ++i) { + if (track_frames_written_[i] < 2) + continue; + + // Estimate the duration for the last block of a Track. + const double nano_per_frame = + static_cast(last_track_timestamp_[i]) / + (track_frames_written_[i] - 1); + const double track_duration = + (last_track_timestamp_[i] + nano_per_frame) / + segment_info_.timecode_scale(); + if (track_duration > duration) + duration = track_duration; + } + } + } segment_info_.set_duration(duration); if (!segment_info_.Finalize(writer_header_)) return false; @@ -2941,7 +3304,9 @@ bool Segment::Finalize() { if (writer_header_->Position(0)) return false; - if (!WriteEbmlHeader(writer_header_, doc_type_version_)) + const char* const doc_type = + DocTypeIsWebm() ? kDocTypeWebm : kDocTypeMatroska; + if (!WriteEbmlHeader(writer_header_, doc_type_version_, doc_type)) return false; if (writer_header_->Position() != ebml_header_size_) return false; @@ -3001,7 +3366,10 @@ uint64_t Segment::AddVideoTrack(int32_t width, int32_t height, int32_t number) { track->set_width(width); track->set_height(height); - tracks_.AddTrack(track, number); + if (!tracks_.AddTrack(track, number)) { + delete track; + return 0; + } has_video_ = true; return track->number(); @@ -3023,8 +3391,10 @@ bool Segment::AddCuePoint(uint64_t timestamp, uint64_t track) { cue->set_block_number(cluster->blocks_added()); cue->set_cluster_pos(cluster->position_for_cues()); cue->set_track(track); - if (!cues_.AddCue(cue)) + if (!cues_.AddCue(cue)) { + delete cue; return false; + } new_cuepoint_ = false; return true; @@ -3041,7 +3411,10 @@ uint64_t Segment::AddAudioTrack(int32_t sample_rate, int32_t channels, track->set_sample_rate(sample_rate); track->set_channels(channels); - tracks_.AddTrack(track, number); + if (!tracks_.AddTrack(track, number)) { + delete track; + return 0; + } return track->number(); } @@ -3130,15 +3503,35 @@ bool Segment::AddGenericFrame(const Frame* frame) { if (frame->discard_padding() != 0) doc_type_version_ = 4; + if (cluster_list_size_ > 0) { + const uint64_t timecode_scale = segment_info_.timecode_scale(); + const uint64_t frame_timecode = frame->timestamp() / timecode_scale; + + const Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1]; + const uint64_t last_cluster_timecode = last_cluster->timecode(); + + const uint64_t rel_timecode = frame_timecode - last_cluster_timecode; + if (rel_timecode > kMaxBlockTimecode) { + force_new_cluster_ = true; + } + } + // If the segment has a video track hold onto audio frames to make sure the // audio that is associated with the start time of a video key-frame is // muxed into the same cluster. 
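When the final block of a track carries no duration and estimate_file_duration is enabled, Finalize() above assumes the last frame lasts one average inter-frame interval. A standalone sketch of that arithmetic with illustrative inputs (result in nanoseconds; Finalize() then divides by the timecode scale):

```cpp
#include <cstdint>

// With N frames written and last timestamp T, the average spacing is
// T / (N - 1); the track is assumed to end one such interval after its last
// frame. Returns 0.0 when there are too few frames to derive an interval.
double EstimateTrackEndNs(uint64_t last_timestamp_ns,
                          uint64_t frames_written) {
  if (frames_written < 2)
    return 0.0;
  const double ns_per_frame =
      static_cast<double>(last_timestamp_ns) / (frames_written - 1);
  return static_cast<double>(last_timestamp_ns) + ns_per_frame;
}
```

An explicit Segment::set_duration() value still takes precedence over both the measured and the estimated duration, matching the |duration_| check above.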
if (has_video_ && tracks_.TrackIsAudio(frame->track_number()) && !force_new_cluster_) { Frame* const new_frame = new (std::nothrow) Frame(); - if (!new_frame || !new_frame->CopyFrom(*frame)) + if (!new_frame || !new_frame->CopyFrom(*frame)) { + delete new_frame; return false; - return QueueFrame(new_frame); + } + if (!QueueFrame(new_frame)) { + delete new_frame; + return false; + } + track_frames_written_[frame->track_number() - 1]++; + return true; } if (!DoNewClusterProcessing(frame->track_number(), frame->timestamp(), @@ -3159,8 +3552,10 @@ bool Segment::AddGenericFrame(const Frame* frame) { if (!frame->CanBeSimpleBlock() && !frame->is_key() && !frame->reference_block_timestamp_set()) { Frame* const new_frame = new (std::nothrow) Frame(); - if (!new_frame->CopyFrom(*frame)) + if (!new_frame || !new_frame->CopyFrom(*frame)) { + delete new_frame; return false; + } new_frame->set_reference_block_timestamp( last_track_timestamp_[frame->track_number() - 1]); frame = new_frame; @@ -3178,10 +3573,10 @@ bool Segment::AddGenericFrame(const Frame* frame) { last_timestamp_ = frame->timestamp(); last_track_timestamp_[frame->track_number() - 1] = frame->timestamp(); last_block_duration_ = frame->duration(); + track_frames_written_[frame->track_number() - 1]++; if (frame_created) delete frame; - return true; } @@ -3292,8 +3687,9 @@ Track* Segment::GetTrackByNumber(uint64_t track_number) const { bool Segment::WriteSegmentHeader() { UpdateDocTypeVersion(); - // TODO(fgalligan): Support more than one segment. - if (!WriteEbmlHeader(writer_header_, doc_type_version_)) + const char* const doc_type = + DocTypeIsWebm() ? kDocTypeWebm : kDocTypeMatroska; + if (!WriteEbmlHeader(writer_header_, doc_type_version_, doc_type)) return false; doc_type_version_written_ = doc_type_version_; ebml_header_size_ = static_cast(writer_header_->Position()); @@ -3766,4 +4162,35 @@ bool Segment::WriteFramesLessThan(uint64_t timestamp) { return true; } +bool Segment::DocTypeIsWebm() const { + const int kNumCodecIds = 9; + + // TODO(vigneshv): Tweak .clang-format. + const char* kWebmCodecIds[kNumCodecIds] = { + Tracks::kOpusCodecId, Tracks::kVorbisCodecId, + Tracks::kVp8CodecId, Tracks::kVp9CodecId, + Tracks::kVp10CodecId, Tracks::kWebVttCaptionsId, + Tracks::kWebVttDescriptionsId, Tracks::kWebVttMetadataId, + Tracks::kWebVttSubtitlesId}; + + const int num_tracks = static_cast(tracks_.track_entries_size()); + for (int track_index = 0; track_index < num_tracks; ++track_index) { + const Track* const track = tracks_.GetTrackByIndex(track_index); + const std::string codec_id = track->codec_id(); + + bool id_is_webm = false; + for (int id_index = 0; id_index < kNumCodecIds; ++id_index) { + if (codec_id == kWebmCodecIds[id_index]) { + id_is_webm = true; + break; + } + } + + if (!id_is_webm) + return false; + } + + return true; +} + } // namespace mkvmuxer diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h index 55ba07196d..46b0029dc4 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h @@ -64,6 +64,12 @@ class IMkvWriter { LIBWEBM_DISALLOW_COPY_AND_ASSIGN(IMkvWriter); }; +// Writes out the EBML header for a WebM file, but allows caller to specify +// DocType. This function must be called before any other libwebm writing +// functions are called. +bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version, + const char* const doc_type); + // Writes out the EBML header for a WebM file. 
This function must be called // before any other libwebm writing functions are called. bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version); @@ -348,26 +354,42 @@ class ContentEncoding { /////////////////////////////////////////////////////////////// // Colour element. -struct PrimaryChromaticity { - PrimaryChromaticity(float x_val, float y_val) : x(x_val), y(y_val) {} - PrimaryChromaticity() : x(0), y(0) {} +class PrimaryChromaticity { + public: + static const float kChromaticityMin; + static const float kChromaticityMax; + + PrimaryChromaticity(float x_val, float y_val) : x_(x_val), y_(y_val) {} + PrimaryChromaticity() : x_(0), y_(0) {} ~PrimaryChromaticity() {} - uint64_t PrimaryChromaticityPayloadSize(libwebm::MkvId x_id, - libwebm::MkvId y_id) const; + + // Returns sum of |x_id| and |y_id| element id sizes and payload sizes. + uint64_t PrimaryChromaticitySize(libwebm::MkvId x_id, + libwebm::MkvId y_id) const; + bool Valid() const; bool Write(IMkvWriter* writer, libwebm::MkvId x_id, libwebm::MkvId y_id) const; - float x; - float y; + float x() const { return x_; } + void set_x(float new_x) { x_ = new_x; } + float y() const { return y_; } + void set_y(float new_y) { y_ = new_y; } + + private: + float x_; + float y_; }; class MasteringMetadata { public: static const float kValueNotPresent; + static const float kMinLuminance; + static const float kMinLuminanceMax; + static const float kMaxLuminanceMax; MasteringMetadata() - : luminance_max(kValueNotPresent), - luminance_min(kValueNotPresent), + : luminance_max_(kValueNotPresent), + luminance_min_(kValueNotPresent), r_(NULL), g_(NULL), b_(NULL), @@ -381,6 +403,7 @@ class MasteringMetadata { // Returns total size of the MasteringMetadata element. uint64_t MasteringMetadataSize() const; + bool Valid() const; bool Write(IMkvWriter* writer) const; // Copies non-null chromaticity. @@ -393,13 +416,21 @@ class MasteringMetadata { const PrimaryChromaticity* b() const { return b_; } const PrimaryChromaticity* white_point() const { return white_point_; } - float luminance_max; - float luminance_min; + float luminance_max() const { return luminance_max_; } + void set_luminance_max(float luminance_max) { + luminance_max_ = luminance_max; + } + float luminance_min() const { return luminance_min_; } + void set_luminance_min(float luminance_min) { + luminance_min_ = luminance_min; + } private: // Returns size of MasteringMetadata child elements. uint64_t PayloadSize() const; + float luminance_max_; + float luminance_min_; PrimaryChromaticity* r_; PrimaryChromaticity* g_; PrimaryChromaticity* b_; @@ -408,26 +439,90 @@ class MasteringMetadata { class Colour { public: + enum MatrixCoefficients { + kGbr = 0, + kBt709 = 1, + kUnspecifiedMc = 2, + kReserved = 3, + kFcc = 4, + kBt470bg = 5, + kSmpte170MMc = 6, + kSmpte240MMc = 7, + kYcocg = 8, + kBt2020NonConstantLuminance = 9, + kBt2020ConstantLuminance = 10, + }; + enum ChromaSitingHorz { + kUnspecifiedCsh = 0, + kLeftCollocated = 1, + kHalfCsh = 2, + }; + enum ChromaSitingVert { + kUnspecifiedCsv = 0, + kTopCollocated = 1, + kHalfCsv = 2, + }; + enum Range { + kUnspecifiedCr = 0, + kBroadcastRange = 1, + kFullRange = 2, + kMcTcDefined = 3, // Defined by MatrixCoefficients/TransferCharacteristics. 
+ }; + enum TransferCharacteristics { + kIturBt709Tc = 1, + kUnspecifiedTc = 2, + kReservedTc = 3, + kGamma22Curve = 4, + kGamma28Curve = 5, + kSmpte170MTc = 6, + kSmpte240MTc = 7, + kLinear = 8, + kLog = 9, + kLogSqrt = 10, + kIec6196624 = 11, + kIturBt1361ExtendedColourGamut = 12, + kIec6196621 = 13, + kIturBt202010bit = 14, + kIturBt202012bit = 15, + kSmpteSt2084 = 16, + kSmpteSt4281Tc = 17, + kAribStdB67Hlg = 18, + }; + enum Primaries { + kReservedP0 = 0, + kIturBt709P = 1, + kUnspecifiedP = 2, + kReservedP3 = 3, + kIturBt470M = 4, + kIturBt470Bg = 5, + kSmpte170MP = 6, + kSmpte240MP = 7, + kFilm = 8, + kIturBt2020 = 9, + kSmpteSt4281P = 10, + kJedecP22Phosphors = 22, + }; static const uint64_t kValueNotPresent; Colour() - : matrix_coefficients(kValueNotPresent), - bits_per_channel(kValueNotPresent), - chroma_subsampling_horz(kValueNotPresent), - chroma_subsampling_vert(kValueNotPresent), - cb_subsampling_horz(kValueNotPresent), - cb_subsampling_vert(kValueNotPresent), - chroma_siting_horz(kValueNotPresent), - chroma_siting_vert(kValueNotPresent), - range(kValueNotPresent), - transfer_characteristics(kValueNotPresent), - primaries(kValueNotPresent), - max_cll(kValueNotPresent), - max_fall(kValueNotPresent), + : matrix_coefficients_(kValueNotPresent), + bits_per_channel_(kValueNotPresent), + chroma_subsampling_horz_(kValueNotPresent), + chroma_subsampling_vert_(kValueNotPresent), + cb_subsampling_horz_(kValueNotPresent), + cb_subsampling_vert_(kValueNotPresent), + chroma_siting_horz_(kValueNotPresent), + chroma_siting_vert_(kValueNotPresent), + range_(kValueNotPresent), + transfer_characteristics_(kValueNotPresent), + primaries_(kValueNotPresent), + max_cll_(kValueNotPresent), + max_fall_(kValueNotPresent), mastering_metadata_(NULL) {} ~Colour() { delete mastering_metadata_; } // Returns total size of the Colour element. uint64_t ColourSize() const; + bool Valid() const; bool Write(IMkvWriter* writer) const; // Deep copies |mastering_metadata|. 
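The enums above name the code points stored by the Colour element. A hedged sketch (hypothetical helper) that fills in a BT.709, broadcast-range Colour through the new setters and checks one field with the validation helpers added to mkvmuxerutil:

```cpp
#include "mkvmuxer/mkvmuxer.h"
#include "mkvmuxer/mkvmuxerutil.h"

// Hypothetical helper: populate a BT.709 Colour element and attach it.
bool ConfigureBt709Colour(mkvmuxer::VideoTrack* track) {
  mkvmuxer::Colour colour;
  colour.set_matrix_coefficients(mkvmuxer::Colour::kBt709);
  colour.set_transfer_characteristics(mkvmuxer::Colour::kIturBt709Tc);
  colour.set_primaries(mkvmuxer::Colour::kIturBt709P);
  colour.set_range(mkvmuxer::Colour::kBroadcastRange);
  // Defensive check via one of the new validation helpers.
  if (!mkvmuxer::IsMatrixCoefficientsValueValid(colour.matrix_coefficients()))
    return false;
  // SetColour() deep-copies the element into the track.
  return track->SetColour(colour);
}
```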
@@ -437,27 +532,124 @@ class Colour { return mastering_metadata_; } - uint64_t matrix_coefficients; - uint64_t bits_per_channel; - uint64_t chroma_subsampling_horz; - uint64_t chroma_subsampling_vert; - uint64_t cb_subsampling_horz; - uint64_t cb_subsampling_vert; - uint64_t chroma_siting_horz; - uint64_t chroma_siting_vert; - uint64_t range; - uint64_t transfer_characteristics; - uint64_t primaries; - uint64_t max_cll; - uint64_t max_fall; + uint64_t matrix_coefficients() const { return matrix_coefficients_; } + void set_matrix_coefficients(uint64_t matrix_coefficients) { + matrix_coefficients_ = matrix_coefficients; + } + uint64_t bits_per_channel() const { return bits_per_channel_; } + void set_bits_per_channel(uint64_t bits_per_channel) { + bits_per_channel_ = bits_per_channel; + } + uint64_t chroma_subsampling_horz() const { return chroma_subsampling_horz_; } + void set_chroma_subsampling_horz(uint64_t chroma_subsampling_horz) { + chroma_subsampling_horz_ = chroma_subsampling_horz; + } + uint64_t chroma_subsampling_vert() const { return chroma_subsampling_vert_; } + void set_chroma_subsampling_vert(uint64_t chroma_subsampling_vert) { + chroma_subsampling_vert_ = chroma_subsampling_vert; + } + uint64_t cb_subsampling_horz() const { return cb_subsampling_horz_; } + void set_cb_subsampling_horz(uint64_t cb_subsampling_horz) { + cb_subsampling_horz_ = cb_subsampling_horz; + } + uint64_t cb_subsampling_vert() const { return cb_subsampling_vert_; } + void set_cb_subsampling_vert(uint64_t cb_subsampling_vert) { + cb_subsampling_vert_ = cb_subsampling_vert; + } + uint64_t chroma_siting_horz() const { return chroma_siting_horz_; } + void set_chroma_siting_horz(uint64_t chroma_siting_horz) { + chroma_siting_horz_ = chroma_siting_horz; + } + uint64_t chroma_siting_vert() const { return chroma_siting_vert_; } + void set_chroma_siting_vert(uint64_t chroma_siting_vert) { + chroma_siting_vert_ = chroma_siting_vert; + } + uint64_t range() const { return range_; } + void set_range(uint64_t range) { range_ = range; } + uint64_t transfer_characteristics() const { + return transfer_characteristics_; + } + void set_transfer_characteristics(uint64_t transfer_characteristics) { + transfer_characteristics_ = transfer_characteristics; + } + uint64_t primaries() const { return primaries_; } + void set_primaries(uint64_t primaries) { primaries_ = primaries; } + uint64_t max_cll() const { return max_cll_; } + void set_max_cll(uint64_t max_cll) { max_cll_ = max_cll; } + uint64_t max_fall() const { return max_fall_; } + void set_max_fall(uint64_t max_fall) { max_fall_ = max_fall; } private: // Returns size of Colour child elements. uint64_t PayloadSize() const; + uint64_t matrix_coefficients_; + uint64_t bits_per_channel_; + uint64_t chroma_subsampling_horz_; + uint64_t chroma_subsampling_vert_; + uint64_t cb_subsampling_horz_; + uint64_t cb_subsampling_vert_; + uint64_t chroma_siting_horz_; + uint64_t chroma_siting_vert_; + uint64_t range_; + uint64_t transfer_characteristics_; + uint64_t primaries_; + uint64_t max_cll_; + uint64_t max_fall_; + MasteringMetadata* mastering_metadata_; }; +/////////////////////////////////////////////////////////////// +// Projection element. 
+class Projection { + public: + enum ProjectionType { + kTypeNotPresent = -1, + kRectangular = 0, + kEquirectangular = 1, + kCubeMap = 2, + kMesh = 3, + }; + static const uint64_t kValueNotPresent; + Projection() + : type_(kRectangular), + pose_yaw_(0.0), + pose_pitch_(0.0), + pose_roll_(0.0), + private_data_(NULL), + private_data_length_(0) {} + ~Projection() { delete[] private_data_; } + + uint64_t ProjectionSize() const; + bool Write(IMkvWriter* writer) const; + + bool SetProjectionPrivate(const uint8_t* private_data, + uint64_t private_data_length); + + ProjectionType type() const { return type_; } + void set_type(ProjectionType type) { type_ = type; } + float pose_yaw() const { return pose_yaw_; } + void set_pose_yaw(float pose_yaw) { pose_yaw_ = pose_yaw; } + float pose_pitch() const { return pose_pitch_; } + void set_pose_pitch(float pose_pitch) { pose_pitch_ = pose_pitch; } + float pose_roll() const { return pose_roll_; } + void set_pose_roll(float pose_roll) { pose_roll_ = pose_roll; } + uint8_t* private_data() const { return private_data_; } + uint64_t private_data_length() const { return private_data_length_; } + + private: + // Returns size of VideoProjection child elements. + uint64_t PayloadSize() const; + + ProjectionType type_; + float pose_yaw_; + float pose_pitch_; + float pose_roll_; + uint8_t* private_data_; + uint64_t private_data_length_; +}; + /////////////////////////////////////////////////////////////// // Track element. class Track { @@ -581,6 +773,10 @@ class VideoTrack : public Track { uint64_t display_height() const { return display_height_; } void set_display_width(uint64_t width) { display_width_ = width; } uint64_t display_width() const { return display_width_; } + void set_pixel_height(uint64_t height) { pixel_height_ = height; } + uint64_t pixel_height() const { return pixel_height_; } + void set_pixel_width(uint64_t width) { pixel_width_ = width; } + uint64_t pixel_width() const { return pixel_width_; } void set_crop_left(uint64_t crop_left) { crop_left_ = crop_left; } uint64_t crop_left() const { return crop_left_; } @@ -605,6 +801,11 @@ class VideoTrack : public Track { // Deep copies |colour|. bool SetColour(const Colour& colour); + Projection* projection() { return projection_; } + + // Deep copies |projection|. + bool SetProjection(const Projection& projection); + private: // Returns the size in bytes of the Video element. uint64_t VideoPayloadSize() const; @@ -612,6 +813,8 @@ class VideoTrack : public Track { // Video track element names. 
uint64_t display_height_; uint64_t display_width_; + uint64_t pixel_height_; + uint64_t pixel_width_; uint64_t crop_left_; uint64_t crop_right_; uint64_t crop_top_; @@ -623,6 +826,7 @@ class VideoTrack : public Track { uint64_t width_; Colour* colour_; + Projection* projection_; LIBWEBM_DISALLOW_COPY_AND_ASSIGN(VideoTrack); }; @@ -670,6 +874,10 @@ class Tracks { static const char kVp8CodecId[]; static const char kVp9CodecId[]; static const char kVp10CodecId[]; + static const char kWebVttCaptionsId[]; + static const char kWebVttDescriptionsId[]; + static const char kWebVttMetadataId[]; + static const char kWebVttSubtitlesId[]; Tracks(); ~Tracks(); @@ -1294,8 +1502,8 @@ class Segment { kBeforeClusters = 0x1 // Position Cues before Clusters }; - const static uint32_t kDefaultDocTypeVersion = 2; - const static uint64_t kDefaultMaxClusterDuration = 30000000000ULL; + static const uint32_t kDefaultDocTypeVersion = 4; + static const uint64_t kDefaultMaxClusterDuration = 30000000000ULL; Segment(); ~Segment(); @@ -1481,7 +1689,16 @@ class Segment { Mode mode() const { return mode_; } CuesPosition cues_position() const { return cues_position_; } bool output_cues() const { return output_cues_; } + void set_estimate_file_duration(bool estimate_duration) { + estimate_file_duration_ = estimate_duration; + } + bool estimate_file_duration() const { return estimate_file_duration_; } const SegmentInfo* segment_info() const { return &segment_info_; } + void set_duration(double duration) { duration_ = duration; } + double duration() const { return duration_; } + + // Returns true when codec IDs are valid for WebM. + bool DocTypeIsWebm() const; private: // Checks if header information has been output and initialized. If not it @@ -1637,6 +1854,9 @@ class Segment { // Last timestamp in nanoseconds by track number added to a cluster. uint64_t last_track_timestamp_[kMaxTrackNumber]; + // Number of frames written per track. + uint64_t track_frames_written_[kMaxTrackNumber]; + // Maximum time in nanoseconds for a cluster duration. This variable is a // guideline and some clusters may have a longer duration. Default is 30 // seconds. @@ -1665,6 +1885,9 @@ class Segment { // Flag whether or not to write the Cluster Timecode using exactly 8 bytes. bool fixed_size_cluster_timecode_; + // Flag whether or not to estimate the file duration. + bool estimate_file_duration_; + // The size of the EBML header, used to validate the header if // WriteEbmlHeader() is called more than once. int32_t ebml_header_size_; @@ -1682,6 +1905,9 @@ class Segment { uint32_t doc_type_version_; uint32_t doc_type_version_written_; + // If |duration_| is > 0, then explicitly set the duration of the segment. + double duration_; + // Pointer to the writer objects. Not owned by this class. IMkvWriter* writer_cluster_; IMkvWriter* writer_cues_; diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc index 3562b8ab82..355d4e22b3 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc @@ -10,6 +10,7 @@ #ifdef __ANDROID__ #include +#include #endif #include @@ -31,20 +32,20 @@ namespace { // Date elements are always 8 octets in size. 
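A hedged sketch of the two new duration controls on Segment, assuming the stock file-backed MkvWriter; the output path and the 90000-tick value (90 seconds at the default 1 ms timecode scale) are illustrative. Finalize() also picks the EBML DocType ("webm" or "matroska") automatically via DocTypeIsWebm().

```cpp
#include "mkvmuxer/mkvmuxer.h"
#include "mkvmuxer/mkvwriter.h"

// Hypothetical end-to-end outline, not a complete muxing example.
bool MuxWithDurationControls(const char* path) {
  mkvmuxer::MkvWriter writer;
  if (!writer.Open(path))
    return false;

  mkvmuxer::Segment segment;
  if (!segment.Init(&writer))
    return false;

  // Either let Finalize() estimate the tail duration per track...
  segment.set_estimate_file_duration(true);
  // ...or pin the segment duration outright; an explicit value wins.
  segment.set_duration(90000.0);

  // ... add tracks and write frames here ...

  const bool ok = segment.Finalize();
  writer.Close();
  return ok;
}
```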
const int kDateElementSize = 8; -uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame, - int64_t timecode, uint64_t timecode_scale) { - uint64_t block_additional_elem_size = 0; - uint64_t block_addid_elem_size = 0; - uint64_t block_more_payload_size = 0; - uint64_t block_more_elem_size = 0; - uint64_t block_additions_payload_size = 0; - uint64_t block_additions_elem_size = 0; +uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode, + uint64 timecode_scale) { + uint64 block_additional_elem_size = 0; + uint64 block_addid_elem_size = 0; + uint64 block_more_payload_size = 0; + uint64 block_more_elem_size = 0; + uint64 block_additions_payload_size = 0; + uint64 block_additions_elem_size = 0; if (frame->additional()) { block_additional_elem_size = EbmlElementSize(libwebm::kMkvBlockAdditional, frame->additional(), frame->additional_length()); - block_addid_elem_size = - EbmlElementSize(libwebm::kMkvBlockAddID, frame->add_id()); + block_addid_elem_size = EbmlElementSize( + libwebm::kMkvBlockAddID, static_cast(frame->add_id())); block_more_payload_size = block_addid_elem_size + block_additional_elem_size; @@ -58,32 +59,33 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame, block_additions_payload_size; } - uint64_t discard_padding_elem_size = 0; + uint64 discard_padding_elem_size = 0; if (frame->discard_padding() != 0) { discard_padding_elem_size = - EbmlElementSize(libwebm::kMkvDiscardPadding, frame->discard_padding()); + EbmlElementSize(libwebm::kMkvDiscardPadding, + static_cast(frame->discard_padding())); } - const uint64_t reference_block_timestamp = + const uint64 reference_block_timestamp = frame->reference_block_timestamp() / timecode_scale; - uint64_t reference_block_elem_size = 0; + uint64 reference_block_elem_size = 0; if (!frame->is_key()) { reference_block_elem_size = EbmlElementSize(libwebm::kMkvReferenceBlock, reference_block_timestamp); } - const uint64_t duration = frame->duration() / timecode_scale; - uint64_t block_duration_elem_size = 0; + const uint64 duration = frame->duration() / timecode_scale; + uint64 block_duration_elem_size = 0; if (duration > 0) block_duration_elem_size = EbmlElementSize(libwebm::kMkvBlockDuration, duration); - const uint64_t block_payload_size = 4 + frame->length(); - const uint64_t block_elem_size = + const uint64 block_payload_size = 4 + frame->length(); + const uint64 block_elem_size = EbmlMasterElementSize(libwebm::kMkvBlock, block_payload_size) + block_payload_size; - const uint64_t block_group_payload_size = + const uint64 block_group_payload_size = block_elem_size + block_additions_elem_size + block_duration_elem_size + discard_padding_elem_size + reference_block_elem_size; @@ -105,7 +107,7 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame, if (SerializeInt(writer, 0, 1)) return 0; - if (writer->Write(frame->frame(), static_cast(frame->length()))) + if (writer->Write(frame->frame(), static_cast(frame->length()))) return 0; if (frame->additional()) { @@ -118,7 +120,8 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame, block_more_payload_size)) return 0; - if (!WriteEbmlElement(writer, libwebm::kMkvBlockAddID, frame->add_id())) + if (!WriteEbmlElement(writer, libwebm::kMkvBlockAddID, + static_cast(frame->add_id()))) return 0; if (!WriteEbmlElement(writer, libwebm::kMkvBlockAdditional, @@ -129,7 +132,7 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame, if (frame->discard_padding() != 0 && !WriteEbmlElement(writer, libwebm::kMkvDiscardPadding, - 
frame->discard_padding())) { + static_cast(frame->discard_padding()))) { return false; } @@ -148,38 +151,38 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame, block_group_payload_size; } -uint64_t WriteSimpleBlock(IMkvWriter* writer, const Frame* const frame, - int64_t timecode) { +uint64 WriteSimpleBlock(IMkvWriter* writer, const Frame* const frame, + int64 timecode) { if (WriteID(writer, libwebm::kMkvSimpleBlock)) return 0; - const int32_t size = static_cast(frame->length()) + 4; + const int32 size = static_cast(frame->length()) + 4; if (WriteUInt(writer, size)) return 0; - if (WriteUInt(writer, static_cast(frame->track_number()))) + if (WriteUInt(writer, static_cast(frame->track_number()))) return 0; if (SerializeInt(writer, timecode, 2)) return 0; - uint64_t flags = 0; + uint64 flags = 0; if (frame->is_key()) flags |= 0x80; if (SerializeInt(writer, flags, 1)) return 0; - if (writer->Write(frame->frame(), static_cast(frame->length()))) + if (writer->Write(frame->frame(), static_cast(frame->length()))) return 0; - return static_cast(GetUIntSize(libwebm::kMkvSimpleBlock) + - GetCodedUIntSize(size) + 4 + frame->length()); + return GetUIntSize(libwebm::kMkvSimpleBlock) + GetCodedUIntSize(size) + 4 + + frame->length(); } } // namespace -int32_t GetCodedUIntSize(uint64_t value) { +int32 GetCodedUIntSize(uint64 value) { if (value < 0x000000000000007FULL) return 1; else if (value < 0x0000000000003FFFULL) @@ -197,7 +200,7 @@ int32_t GetCodedUIntSize(uint64_t value) { return 8; } -int32_t GetUIntSize(uint64_t value) { +int32 GetUIntSize(uint64 value) { if (value < 0x0000000000000100ULL) return 1; else if (value < 0x0000000000010000ULL) @@ -215,26 +218,26 @@ int32_t GetUIntSize(uint64_t value) { return 8; } -int32_t GetIntSize(int64_t value) { +int32 GetIntSize(int64 value) { // Doubling the requested value ensures positive values with their high bit // set are written with 0-padding to avoid flipping the signedness. - const uint64_t v = (value < 0) ? value ^ -1LL : value; + const uint64 v = (value < 0) ? value ^ -1LL : value; return GetUIntSize(2 * v); } -uint64_t EbmlMasterElementSize(uint64_t type, uint64_t value) { +uint64 EbmlMasterElementSize(uint64 type, uint64 value) { // Size of EBML ID - int32_t ebml_size = GetUIntSize(type); + int32 ebml_size = GetUIntSize(type); // Datasize ebml_size += GetCodedUIntSize(value); - return static_cast(ebml_size); + return ebml_size; } -uint64_t EbmlElementSize(uint64_t type, int64_t value) { +uint64 EbmlElementSize(uint64 type, int64 value) { // Size of EBML ID - int32_t ebml_size = GetUIntSize(type); + int32 ebml_size = GetUIntSize(type); // Datasize ebml_size += GetIntSize(value); @@ -242,20 +245,19 @@ uint64_t EbmlElementSize(uint64_t type, int64_t value) { // Size of Datasize ebml_size++; - return static_cast(ebml_size); + return ebml_size; } -uint64_t EbmlElementSize(uint64_t type, uint64_t value) { +uint64 EbmlElementSize(uint64 type, uint64 value) { return EbmlElementSize(type, value, 0); } -uint64_t EbmlElementSize(uint64_t type, uint64_t value, uint64_t fixed_size) { +uint64 EbmlElementSize(uint64 type, uint64 value, uint64 fixed_size) { // Size of EBML ID - uint64_t ebml_size = static_cast(GetUIntSize(type)); + uint64 ebml_size = GetUIntSize(type); // Datasize - ebml_size += - (fixed_size > 0) ? fixed_size : static_cast(GetUIntSize(value)); + ebml_size += (fixed_size > 0) ? 
fixed_size : GetUIntSize(value); // Size of Datasize ebml_size++; @@ -263,9 +265,9 @@ uint64_t EbmlElementSize(uint64_t type, uint64_t value, uint64_t fixed_size) { return ebml_size; } -uint64_t EbmlElementSize(uint64_t type, float /* value */) { +uint64 EbmlElementSize(uint64 type, float /* value */) { // Size of EBML ID - uint64_t ebml_size = static_cast(GetUIntSize(type)); + uint64 ebml_size = GetUIntSize(type); // Datasize ebml_size += sizeof(float); @@ -276,28 +278,28 @@ uint64_t EbmlElementSize(uint64_t type, float /* value */) { return ebml_size; } -uint64_t EbmlElementSize(uint64_t type, const char* value) { +uint64 EbmlElementSize(uint64 type, const char* value) { if (!value) return 0; // Size of EBML ID - uint64_t ebml_size = static_cast(GetUIntSize(type)); + uint64 ebml_size = GetUIntSize(type); // Datasize ebml_size += strlen(value); // Size of Datasize - ebml_size++; + ebml_size += GetCodedUIntSize(strlen(value)); return ebml_size; } -uint64_t EbmlElementSize(uint64_t type, const uint8_t* value, uint64_t size) { +uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size) { if (!value) return 0; // Size of EBML ID - uint64_t ebml_size = static_cast(GetUIntSize(type)); + uint64 ebml_size = GetUIntSize(type); // Datasize ebml_size += size; @@ -308,9 +310,9 @@ uint64_t EbmlElementSize(uint64_t type, const uint8_t* value, uint64_t size) { return ebml_size; } -uint64_t EbmlDateElementSize(uint64_t type) { +uint64 EbmlDateElementSize(uint64 type) { // Size of EBML ID - uint64_t ebml_size = static_cast(GetUIntSize(type)); + uint64 ebml_size = GetUIntSize(type); // Datasize ebml_size += kDateElementSize; @@ -321,18 +323,18 @@ uint64_t EbmlDateElementSize(uint64_t type) { return ebml_size; } -int32_t SerializeInt(IMkvWriter* writer, int64_t value, int32_t size) { +int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size) { if (!writer || size < 1 || size > 8) return -1; - for (int32_t i = 1; i <= size; ++i) { - const int32_t byte_count = size - i; - const int32_t bit_count = byte_count * 8; + for (int32 i = 1; i <= size; ++i) { + const int32 byte_count = size - i; + const int32 bit_count = byte_count * 8; - const int64_t bb = value >> bit_count; - const uint8_t b = static_cast(bb); + const int64 bb = value >> bit_count; + const uint8 b = static_cast(bb); - const int32_t status = writer->Write(&b, 1); + const int32 status = writer->Write(&b, 1); if (status < 0) return status; @@ -341,26 +343,26 @@ int32_t SerializeInt(IMkvWriter* writer, int64_t value, int32_t size) { return 0; } -int32_t SerializeFloat(IMkvWriter* writer, float f) { +int32 SerializeFloat(IMkvWriter* writer, float f) { if (!writer) return -1; - assert(sizeof(uint32_t) == sizeof(float)); + assert(sizeof(uint32) == sizeof(float)); // This union is merely used to avoid a reinterpret_cast from float& to // uint32& which will result in violation of strict aliasing. 
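A behavioural fix in this hunk: the string overload of EbmlElementSize() now sizes its length descriptor with GetCodedUIntSize(strlen(value)) instead of always adding one byte, which matters for strings of 127 bytes or more. A standalone sketch of the VINT width rule it relies on:

```cpp
#include <cstdint>

// How many octets an EBML coded (VINT) length field needs. Each octet
// contributes 7 usable bits; thresholds mirror GetCodedUIntSize().
int CodedLengthBytes(uint64_t value) {
  int bytes = 1;
  uint64_t limit = 0x7F;  // one octet encodes values below 0x7F
  while (value >= limit && bytes < 8) {
    ++bytes;
    limit = (limit << 7) | 0x7F;  // 0x3FFF, 0x1FFFFF, ...
  }
  return bytes;
}

// Example: a 20-byte codec ID needs 1 length byte, a 300-byte string needs 2,
// so the old "ebml_size++" under-counted the element size for long strings.
```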
union U32 { - uint32_t u32; + uint32 u32; float f; } value; value.f = f; - for (int32_t i = 1; i <= 4; ++i) { - const int32_t byte_count = 4 - i; - const int32_t bit_count = byte_count * 8; + for (int32 i = 1; i <= 4; ++i) { + const int32 byte_count = 4 - i; + const int32 bit_count = byte_count * 8; - const uint8_t byte = static_cast(value.u32 >> bit_count); + const uint8 byte = static_cast(value.u32 >> bit_count); - const int32_t status = writer->Write(&byte, 1); + const int32 status = writer->Write(&byte, 1); if (status < 0) return status; @@ -369,21 +371,21 @@ int32_t SerializeFloat(IMkvWriter* writer, float f) { return 0; } -int32_t WriteUInt(IMkvWriter* writer, uint64_t value) { +int32 WriteUInt(IMkvWriter* writer, uint64 value) { if (!writer) return -1; - int32_t size = GetCodedUIntSize(value); + int32 size = GetCodedUIntSize(value); return WriteUIntSize(writer, value, size); } -int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size) { +int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size) { if (!writer || size < 0 || size > 8) return -1; if (size > 0) { - const uint64_t bit = 1LL << (size * 7); + const uint64 bit = 1LL << (size * 7); if (value > (bit - 2)) return -1; @@ -391,11 +393,11 @@ int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size) { value |= bit; } else { size = 1; - int64_t bit; + int64 bit; for (;;) { bit = 1LL << (size * 7); - const uint64_t max = bit - 2; + const uint64 max = bit - 2; if (value <= max) break; @@ -412,18 +414,18 @@ int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size) { return SerializeInt(writer, value, size); } -int32_t WriteID(IMkvWriter* writer, uint64_t type) { +int32 WriteID(IMkvWriter* writer, uint64 type) { if (!writer) return -1; writer->ElementStartNotify(type, writer->Position()); - const int32_t size = GetUIntSize(type); + const int32 size = GetUIntSize(type); return SerializeInt(writer, type, size); } -bool WriteEbmlMasterElement(IMkvWriter* writer, uint64_t type, uint64_t size) { +bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 type, uint64 size) { if (!writer) return false; @@ -436,19 +438,19 @@ bool WriteEbmlMasterElement(IMkvWriter* writer, uint64_t type, uint64_t size) { return true; } -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value) { +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value) { return WriteEbmlElement(writer, type, value, 0); } -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value, - uint64_t fixed_size) { +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value, + uint64 fixed_size) { if (!writer) return false; if (WriteID(writer, type)) return false; - uint64_t size = static_cast(GetUIntSize(value)); + uint64 size = GetUIntSize(value); if (fixed_size > 0) { if (size > fixed_size) return false; @@ -457,30 +459,30 @@ bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value, if (WriteUInt(writer, size)) return false; - if (SerializeInt(writer, value, static_cast(size))) + if (SerializeInt(writer, value, static_cast(size))) return false; return true; } -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, int64_t value) { +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value) { if (!writer) return false; if (WriteID(writer, type)) return 0; - const uint64_t size = GetIntSize(value); + const uint64 size = GetIntSize(value); if (WriteUInt(writer, size)) return false; - if (SerializeInt(writer, value, static_cast(size))) + if (SerializeInt(writer, value, 
static_cast(size))) return false; return true; } -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, float value) { +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value) { if (!writer) return false; @@ -496,25 +498,25 @@ bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, float value) { return true; } -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const char* value) { +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value) { if (!writer || !value) return false; if (WriteID(writer, type)) return false; - const uint64_t length = strlen(value); + const uint64 length = strlen(value); if (WriteUInt(writer, length)) return false; - if (writer->Write(value, static_cast(length))) + if (writer->Write(value, static_cast(length))) return false; return true; } -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const uint8_t* value, - uint64_t size) { +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value, + uint64 size) { if (!writer || !value || size < 1) return false; @@ -524,13 +526,13 @@ bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const uint8_t* value, if (WriteUInt(writer, size)) return false; - if (writer->Write(value, static_cast(size))) + if (writer->Write(value, static_cast(size))) return false; return true; } -bool WriteEbmlDateElement(IMkvWriter* writer, uint64_t type, int64_t value) { +bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value) { if (!writer) return false; @@ -546,8 +548,8 @@ bool WriteEbmlDateElement(IMkvWriter* writer, uint64_t type, int64_t value) { return true; } -uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame, - Cluster* cluster) { +uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame, + Cluster* cluster) { if (!writer || !frame || !frame->IsValid() || !cluster || !cluster->timecode_scale()) return 0; @@ -556,7 +558,7 @@ uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame, // timecode for the cluster itself (remember that block timecode // is a signed, 16-bit integer). However, as a simplification we // only permit non-negative cluster-relative timecodes for blocks. - const int64_t relative_timecode = cluster->GetRelativeTimecode( + const int64 relative_timecode = cluster->GetRelativeTimecode( frame->timestamp() / cluster->timecode_scale()); if (relative_timecode < 0 || relative_timecode > kMaxBlockTimecode) return 0; @@ -567,20 +569,19 @@ uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame, cluster->timecode_scale()); } -uint64_t WriteVoidElement(IMkvWriter* writer, uint64_t size) { +uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) { if (!writer) return false; // Subtract one for the void ID and the coded size. 
- uint64_t void_entry_size = size - 1 - GetCodedUIntSize(size - 1); - uint64_t void_size = - EbmlMasterElementSize(libwebm::kMkvVoid, void_entry_size) + - void_entry_size; + uint64 void_entry_size = size - 1 - GetCodedUIntSize(size - 1); + uint64 void_size = EbmlMasterElementSize(libwebm::kMkvVoid, void_entry_size) + + void_entry_size; if (void_size != size) return 0; - const int64_t payload_position = writer->Position(); + const int64 payload_position = writer->Position(); if (payload_position < 0) return 0; @@ -590,30 +591,29 @@ uint64_t WriteVoidElement(IMkvWriter* writer, uint64_t size) { if (WriteUInt(writer, void_entry_size)) return 0; - const uint8_t value = 0; - for (int32_t i = 0; i < static_cast(void_entry_size); ++i) { + const uint8 value = 0; + for (int32 i = 0; i < static_cast(void_entry_size); ++i) { if (writer->Write(&value, 1)) return 0; } - const int64_t stop_position = writer->Position(); + const int64 stop_position = writer->Position(); if (stop_position < 0 || - stop_position - payload_position != static_cast(void_size)) + stop_position - payload_position != static_cast(void_size)) return 0; return void_size; } -void GetVersion(int32_t* major, int32_t* minor, int32_t* build, - int32_t* revision) { +void GetVersion(int32* major, int32* minor, int32* build, int32* revision) { *major = 0; *minor = 2; *build = 1; *revision = 0; } -uint64_t MakeUID(unsigned int* seed) { - uint64_t uid = 0; +uint64 MakeUID(unsigned int* seed) { + uint64 uid = 0; #ifdef __MINGW32__ srand(*seed); @@ -625,21 +625,22 @@ uint64_t MakeUID(unsigned int* seed) { // TODO(fgalligan): Move random number generation to platform specific code. #ifdef _MSC_VER (void)seed; - const int32_t nn = rand(); + const int32 nn = rand(); #elif __ANDROID__ - int32_t temp_num = 1; + (void)seed; + int32 temp_num = 1; int fd = open("/dev/urandom", O_RDONLY); if (fd != -1) { read(fd, &temp_num, sizeof(temp_num)); close(fd); } - const int32_t nn = temp_num; + const int32 nn = temp_num; #elif defined __MINGW32__ - const int32_t nn = rand(); + const int32 nn = rand(); #else - const int32_t nn = rand_r(seed); + const int32 nn = rand_r(seed); #endif - const int32_t n = 0xFF & (nn >> 4); // throw away low-order bits + const int32 n = 0xFF & (nn >> 4); // throw away low-order bits uid |= n; } @@ -647,4 +648,97 @@ uint64_t MakeUID(unsigned int* seed) { return uid; } +bool IsMatrixCoefficientsValueValid(uint64_t value) { + switch (value) { + case mkvmuxer::Colour::kGbr: + case mkvmuxer::Colour::kBt709: + case mkvmuxer::Colour::kUnspecifiedMc: + case mkvmuxer::Colour::kReserved: + case mkvmuxer::Colour::kFcc: + case mkvmuxer::Colour::kBt470bg: + case mkvmuxer::Colour::kSmpte170MMc: + case mkvmuxer::Colour::kSmpte240MMc: + case mkvmuxer::Colour::kYcocg: + case mkvmuxer::Colour::kBt2020NonConstantLuminance: + case mkvmuxer::Colour::kBt2020ConstantLuminance: + return true; + } + return false; +} + +bool IsChromaSitingHorzValueValid(uint64_t value) { + switch (value) { + case mkvmuxer::Colour::kUnspecifiedCsh: + case mkvmuxer::Colour::kLeftCollocated: + case mkvmuxer::Colour::kHalfCsh: + return true; + } + return false; +} + +bool IsChromaSitingVertValueValid(uint64_t value) { + switch (value) { + case mkvmuxer::Colour::kUnspecifiedCsv: + case mkvmuxer::Colour::kTopCollocated: + case mkvmuxer::Colour::kHalfCsv: + return true; + } + return false; +} + +bool IsColourRangeValueValid(uint64_t value) { + switch (value) { + case mkvmuxer::Colour::kUnspecifiedCr: + case mkvmuxer::Colour::kBroadcastRange: + case mkvmuxer::Colour::kFullRange: + 
case mkvmuxer::Colour::kMcTcDefined: + return true; + } + return false; +} + +bool IsTransferCharacteristicsValueValid(uint64_t value) { + switch (value) { + case mkvmuxer::Colour::kIturBt709Tc: + case mkvmuxer::Colour::kUnspecifiedTc: + case mkvmuxer::Colour::kReservedTc: + case mkvmuxer::Colour::kGamma22Curve: + case mkvmuxer::Colour::kGamma28Curve: + case mkvmuxer::Colour::kSmpte170MTc: + case mkvmuxer::Colour::kSmpte240MTc: + case mkvmuxer::Colour::kLinear: + case mkvmuxer::Colour::kLog: + case mkvmuxer::Colour::kLogSqrt: + case mkvmuxer::Colour::kIec6196624: + case mkvmuxer::Colour::kIturBt1361ExtendedColourGamut: + case mkvmuxer::Colour::kIec6196621: + case mkvmuxer::Colour::kIturBt202010bit: + case mkvmuxer::Colour::kIturBt202012bit: + case mkvmuxer::Colour::kSmpteSt2084: + case mkvmuxer::Colour::kSmpteSt4281Tc: + case mkvmuxer::Colour::kAribStdB67Hlg: + return true; + } + return false; +} + +bool IsPrimariesValueValid(uint64_t value) { + switch (value) { + case mkvmuxer::Colour::kReservedP0: + case mkvmuxer::Colour::kIturBt709P: + case mkvmuxer::Colour::kUnspecifiedP: + case mkvmuxer::Colour::kReservedP3: + case mkvmuxer::Colour::kIturBt470M: + case mkvmuxer::Colour::kIturBt470Bg: + case mkvmuxer::Colour::kSmpte170MP: + case mkvmuxer::Colour::kSmpte240MP: + case mkvmuxer::Colour::kFilm: + case mkvmuxer::Colour::kIturBt2020: + case mkvmuxer::Colour::kSmpteSt4281P: + case mkvmuxer::Colour::kJedecP22Phosphors: + return true; + } + return false; +} + } // namespace mkvmuxer diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h index 0e21a2dcbe..132388da59 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h @@ -8,87 +8,104 @@ #ifndef MKVMUXER_MKVMUXERUTIL_H_ #define MKVMUXER_MKVMUXERUTIL_H_ -#include +#include "mkvmuxertypes.h" + +#include "stdint.h" namespace mkvmuxer { class Cluster; class Frame; class IMkvWriter; -const uint64_t kEbmlUnknownValue = 0x01FFFFFFFFFFFFFFULL; -const int64_t kMaxBlockTimecode = 0x07FFFLL; +// TODO(tomfinegan): mkvmuxer:: integer types continue to be used here because +// changing them causes pain for downstream projects. It would be nice if a +// solution that allows removal of the mkvmuxer:: integer types while avoiding +// pain for downstream users of libwebm. Considering that mkvmuxerutil.{cc,h} +// are really, for the great majority of cases, EBML size calculation and writer +// functions, perhaps a more EBML focused utility would be the way to go as a +// first step. + +const uint64 kEbmlUnknownValue = 0x01FFFFFFFFFFFFFFULL; +const int64 kMaxBlockTimecode = 0x07FFFLL; // Writes out |value| in Big Endian order. Returns 0 on success. -int32_t SerializeInt(IMkvWriter* writer, int64_t value, int32_t size); +int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size); // Returns the size in bytes of the element. 
-int32_t GetUIntSize(uint64_t value); -int32_t GetIntSize(int64_t value); -int32_t GetCodedUIntSize(uint64_t value); -uint64_t EbmlMasterElementSize(uint64_t type, uint64_t value); -uint64_t EbmlElementSize(uint64_t type, int64_t value); -uint64_t EbmlElementSize(uint64_t type, uint64_t value); -uint64_t EbmlElementSize(uint64_t type, float value); -uint64_t EbmlElementSize(uint64_t type, const char* value); -uint64_t EbmlElementSize(uint64_t type, const uint8_t* value, uint64_t size); -uint64_t EbmlDateElementSize(uint64_t type); +int32 GetUIntSize(uint64 value); +int32 GetIntSize(int64 value); +int32 GetCodedUIntSize(uint64 value); +uint64 EbmlMasterElementSize(uint64 type, uint64 value); +uint64 EbmlElementSize(uint64 type, int64 value); +uint64 EbmlElementSize(uint64 type, uint64 value); +uint64 EbmlElementSize(uint64 type, float value); +uint64 EbmlElementSize(uint64 type, const char* value); +uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size); +uint64 EbmlDateElementSize(uint64 type); // Returns the size in bytes of the element assuming that the element was // written using |fixed_size| bytes. If |fixed_size| is set to zero, then it // computes the necessary number of bytes based on |value|. -uint64_t EbmlElementSize(uint64_t type, uint64_t value, uint64_t fixed_size); +uint64 EbmlElementSize(uint64 type, uint64 value, uint64 fixed_size); // Creates an EBML coded number from |value| and writes it out. The size of // the coded number is determined by the value of |value|. |value| must not // be in a coded form. Returns 0 on success. -int32_t WriteUInt(IMkvWriter* writer, uint64_t value); +int32 WriteUInt(IMkvWriter* writer, uint64 value); // Creates an EBML coded number from |value| and writes it out. The size of // the coded number is determined by the value of |size|. |value| must not // be in a coded form. Returns 0 on success. -int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size); +int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size); // Output an Mkv master element. Returns true if the element was written. -bool WriteEbmlMasterElement(IMkvWriter* writer, uint64_t value, uint64_t size); +bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 value, uint64 size); // Outputs an Mkv ID, calls |IMkvWriter::ElementStartNotify|, and passes the // ID to |SerializeInt|. Returns 0 on success. -int32_t WriteID(IMkvWriter* writer, uint64_t type); +int32 WriteID(IMkvWriter* writer, uint64 type); // Output an Mkv non-master element. Returns true if the element was written. -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value); -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, int64_t value); -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, float value); -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const char* value); -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const uint8_t* value, - uint64_t size); -bool WriteEbmlDateElement(IMkvWriter* writer, uint64_t type, int64_t value); +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value); +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value); +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value); +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value); +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value, + uint64 size); +bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value); // Output an Mkv non-master element using fixed size. 
The element will be // written out using exactly |fixed_size| bytes. If |fixed_size| is set to zero // then it computes the necessary number of bytes based on |value|. Returns true // if the element was written. -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value, - uint64_t fixed_size); +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value, + uint64 fixed_size); // Output a Mkv Frame. It decides the correct element to write (Block vs // SimpleBlock) based on the parameters of the Frame. -uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame, - Cluster* cluster); +uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame, + Cluster* cluster); // Output a void element. |size| must be the entire size in bytes that will be // void. The function will calculate the size of the void header and subtract // it from |size|. -uint64_t WriteVoidElement(IMkvWriter* writer, uint64_t size); +uint64 WriteVoidElement(IMkvWriter* writer, uint64 size); // Returns the version number of the muxer in |major|, |minor|, |build|, // and |revision|. -void GetVersion(int32_t* major, int32_t* minor, int32_t* build, - int32_t* revision); +void GetVersion(int32* major, int32* minor, int32* build, int32* revision); // Returns a random number to be used for UID, using |seed| to seed // the random-number generator (see POSIX rand_r() for semantics). -uint64_t MakeUID(unsigned int* seed); +uint64 MakeUID(unsigned int* seed); + +// Colour field validation helpers. All return true when |value| is valid. +bool IsMatrixCoefficientsValueValid(uint64_t value); +bool IsChromaSitingHorzValueValid(uint64_t value); +bool IsChromaSitingVertValueValid(uint64_t value); +bool IsColourRangeValueValid(uint64_t value); +bool IsTransferCharacteristicsValueValid(uint64_t value); +bool IsPrimariesValueValid(uint64_t value); } // namespace mkvmuxer diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc index ca48e149c6..84655d802a 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc @@ -8,6 +8,8 @@ #include "mkvmuxer/mkvwriter.h" +#include + #ifdef _MSC_VER #include // for _SH_DENYWR #endif @@ -77,7 +79,7 @@ int32 MkvWriter::Position(int64 position) { #ifdef _MSC_VER return _fseeki64(file_, position, SEEK_SET); #else - return fseek(file_, position, SEEK_SET); + return fseeko(file_, static_cast(position), SEEK_SET); #endif } diff --git a/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.cc b/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.cc index 21801154d9..37f230d0a9 100644 --- a/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.cc +++ b/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.cc @@ -22,9 +22,15 @@ #include "common/webmids.h" +// disable deprecation warnings for auto_ptr +#if defined(__GNUC__) && __GNUC__ >= 5 +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + namespace mkvparser { const float MasteringMetadata::kValueNotPresent = FLT_MAX; const long long Colour::kValueNotPresent = LLONG_MAX; +const float Projection::kValueNotPresent = FLT_MAX; #ifdef MSC_COMPAT inline bool isnan(double val) { return !!_isnan(val); } @@ -1475,6 +1481,8 @@ long Segment::Load() { } } +SeekHead::Entry::Entry() : id(0), pos(0), element_start(0), element_size(0) {} + SeekHead::SeekHead(Segment* pSegment, long long start, long long size_, long long element_start, long long element_size) : m_pSegment(pSegment), @@ -1525,15 +1533,19 @@ 
long SeekHead::Parse() { if (pos != stop) return E_FILE_FORMAT_INVALID; - m_entries = new (std::nothrow) Entry[entry_count]; + if (entry_count > 0) { + m_entries = new (std::nothrow) Entry[entry_count]; - if (m_entries == NULL) - return -1; + if (m_entries == NULL) + return -1; + } - m_void_elements = new (std::nothrow) VoidElement[void_element_count]; + if (void_element_count > 0) { + m_void_elements = new (std::nothrow) VoidElement[void_element_count]; - if (m_void_elements == NULL) - return -1; + if (m_void_elements == NULL) + return -1; + } // now parse the entries and void elements @@ -1552,14 +1564,14 @@ long SeekHead::Parse() { if (status < 0) // error return status; - if (id == libwebm::kMkvSeek) { + if (id == libwebm::kMkvSeek && entry_count > 0) { if (ParseEntry(pReader, pos, size, pEntry)) { Entry& e = *pEntry++; e.element_start = idpos; e.element_size = (pos + size) - idpos; } - } else if (id == libwebm::kMkvVoid) { + } else if (id == libwebm::kMkvVoid && void_element_count > 0) { VoidElement& e = *pVoidElement++; e.element_start = idpos; @@ -1766,18 +1778,7 @@ bool SeekHead::ParseEntry(IMkvReader* pReader, long long start, long long size_, if ((pos + seekIdSize) > stop) return false; - // Note that the SeekId payload really is serialized - // as a "Matroska integer", not as a plain binary value. - // In fact, Matroska requires that ID values in the - // stream exactly match the binary representation as listed - // in the Matroska specification. - // - // This parser is more liberal, and permits IDs to have - // any width. (This could make the representation in the stream - // different from what's in the spec, but it doesn't matter here, - // since we always normalize "Matroska integer" values.) - - pEntry->id = ReadUInt(pReader, pos, len); // payload + pEntry->id = ReadID(pReader, pos, len); // payload if (pEntry->id <= 0) return false; @@ -2434,7 +2435,9 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_, } const CuePoint::TrackPosition* CuePoint::Find(const Track* pTrack) const { - assert(pTrack); + if (pTrack == NULL) { + return NULL; + } const long long n = pTrack->GetNumber(); @@ -4034,7 +4037,7 @@ long SegmentInfo::Parse() { } const double rollover_check = m_duration * m_timecodeScale; - if (rollover_check > LLONG_MAX) + if (rollover_check > static_cast(LLONG_MAX)) return E_FILE_FORMAT_INVALID; if (pos != stop) @@ -4125,7 +4128,7 @@ ContentEncoding::~ContentEncoding() { } const ContentEncoding::ContentCompression* - ContentEncoding::GetCompressionByIndex(unsigned long idx) const { +ContentEncoding::GetCompressionByIndex(unsigned long idx) const { const ptrdiff_t count = compression_entries_end_ - compression_entries_; assert(count >= 0); @@ -4983,29 +4986,27 @@ bool PrimaryChromaticity::Parse(IMkvReader* reader, long long read_pos, if (!reader) return false; - std::auto_ptr chromaticity_ptr; + if (!*chromaticity) + *chromaticity = new PrimaryChromaticity(); - if (!*chromaticity) { - chromaticity_ptr.reset(new PrimaryChromaticity()); - } else { - chromaticity_ptr.reset(*chromaticity); - } - - if (!chromaticity_ptr.get()) + if (!*chromaticity) return false; - float* value = is_x ? &chromaticity_ptr->x : &chromaticity_ptr->y; + PrimaryChromaticity* pc = *chromaticity; + float* value = is_x ? &pc->x : &pc->y; double parser_value = 0; - const long long value_parse_status = + const long long parse_status = UnserializeFloat(reader, read_pos, value_size, parser_value); + // Valid range is [0, 1]. 
Make sure the double is representable as a float + // before casting. + if (parse_status < 0 || parser_value < 0.0 || parser_value > 1.0 || + (parser_value > 0.0 && parser_value < FLT_MIN)) + return false; + *value = static_cast(parser_value); - if (value_parse_status < 0 || *value < 0.0 || *value > 1.0) - return false; - - *chromaticity = chromaticity_ptr.release(); return true; } @@ -5188,11 +5189,94 @@ bool Colour::Parse(IMkvReader* reader, long long colour_start, return true; } +bool Projection::Parse(IMkvReader* reader, long long start, long long size, + Projection** projection) { + if (!reader || *projection) + return false; + + std::auto_ptr projection_ptr(new Projection()); + if (!projection_ptr.get()) + return false; + + const long long end = start + size; + long long read_pos = start; + + while (read_pos < end) { + long long child_id = 0; + long long child_size = 0; + + const long long status = + ParseElementHeader(reader, read_pos, end, child_id, child_size); + if (status < 0) + return false; + + if (child_id == libwebm::kMkvProjectionType) { + long long projection_type = kTypeNotPresent; + projection_type = UnserializeUInt(reader, read_pos, child_size); + if (projection_type < 0) + return false; + + projection_ptr->type = static_cast(projection_type); + } else if (child_id == libwebm::kMkvProjectionPrivate) { + unsigned char* data = SafeArrayAlloc(1, child_size); + + if (data == NULL) + return false; + + const int status = + reader->Read(read_pos, static_cast(child_size), data); + + if (status) { + delete[] data; + return false; + } + + projection_ptr->private_data = data; + projection_ptr->private_data_length = static_cast(child_size); + } else { + double value = 0; + const long long value_parse_status = + UnserializeFloat(reader, read_pos, child_size, value); + // Make sure value is representable as a float before casting. 
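Editor's aside on the representability checks used just below (and in PrimaryChromaticity::Parse above): a double parsed from the stream is only cast to float once it is known to survive the conversion, i.e. its magnitude is not above FLT_MAX and a positive value is not below FLT_MIN. A small standalone sketch of that guard, under those assumptions (the helper name is mine, not part of the patch):

#include <float.h>
/* Sketch: mirrors the intent of the checks in this patch. Zero and
 * in-range negative values pass; positive values below FLT_MIN are
 * rejected because they would underflow when narrowed to float. */
static int fits_in_float_sketch(double v) {
  if (v < -FLT_MAX || v > FLT_MAX) return 0;  /* would overflow a float */
  if (v > 0.0 && v < FLT_MIN) return 0;       /* would underflow */
  return 1;
}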
+ if (value_parse_status < 0 || value < -FLT_MAX || value > FLT_MAX || + (value > 0.0 && value < FLT_MIN)) { + return false; + } + + switch (child_id) { + case libwebm::kMkvProjectionPoseYaw: + projection_ptr->pose_yaw = static_cast(value); + break; + case libwebm::kMkvProjectionPosePitch: + projection_ptr->pose_pitch = static_cast(value); + break; + case libwebm::kMkvProjectionPoseRoll: + projection_ptr->pose_roll = static_cast(value); + break; + default: + return false; + } + } + + read_pos += child_size; + if (read_pos > end) + return false; + } + + *projection = projection_ptr.release(); + return true; +} + VideoTrack::VideoTrack(Segment* pSegment, long long element_start, long long element_size) - : Track(pSegment, element_start, element_size), m_colour(NULL) {} + : Track(pSegment, element_start, element_size), + m_colour(NULL), + m_projection(NULL) {} -VideoTrack::~VideoTrack() { delete m_colour; } +VideoTrack::~VideoTrack() { + delete m_colour; + delete m_projection; +} long VideoTrack::Parse(Segment* pSegment, const Info& info, long long element_start, long long element_size, @@ -5224,6 +5308,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, const long long stop = pos + s.size; Colour* colour = NULL; + Projection* projection = NULL; while (pos < stop) { long long id, size; @@ -5274,6 +5359,9 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, } else if (id == libwebm::kMkvColour) { if (!Colour::Parse(pReader, pos, size, &colour)) return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvProjection) { + if (!Projection::Parse(pReader, pos, size, &projection)) + return E_FILE_FORMAT_INVALID; } pos += size; // consume payload @@ -5305,6 +5393,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, pTrack->m_stereo_mode = stereo_mode; pTrack->m_rate = rate; pTrack->m_colour = colour; + pTrack->m_projection = projection; pResult = pTrack; return 0; // success @@ -5405,6 +5494,8 @@ long VideoTrack::Seek(long long time_ns, const BlockEntry*& pResult) const { Colour* VideoTrack::GetColour() const { return m_colour; } +Projection* VideoTrack::GetProjection() const { return m_projection; } + long long VideoTrack::GetWidth() const { return m_width; } long long VideoTrack::GetHeight() const { return m_height; } @@ -6698,8 +6789,10 @@ Cluster::Cluster(Segment* pSegment, long idx, long long element_start {} Cluster::~Cluster() { - if (m_entries_count <= 0) + if (m_entries_count <= 0) { + delete[] m_entries; return; + } BlockEntry** i = m_entries; BlockEntry** const j = m_entries + m_entries_count; @@ -7850,7 +7943,6 @@ long Block::Parse(const Cluster* pCluster) { pf = m_frames; while (pf != pf_end) { Frame& f = *pf++; - assert((pos + f.len) <= stop); if ((pos + f.len) > stop) return E_FILE_FORMAT_INVALID; diff --git a/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.h b/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.h index 42e6e88ab4..26c2b7e5eb 100644 --- a/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.h +++ b/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.h @@ -473,6 +473,34 @@ struct Colour { MasteringMetadata* mastering_metadata; }; +struct Projection { + enum ProjectionType { + kTypeNotPresent = -1, + kRectangular = 0, + kEquirectangular = 1, + kCubeMap = 2, + kMesh = 3, + }; + static const float kValueNotPresent; + Projection() + : type(kTypeNotPresent), + private_data(NULL), + private_data_length(0), + pose_yaw(kValueNotPresent), + pose_pitch(kValueNotPresent), + pose_roll(kValueNotPresent) {} + ~Projection() { delete[] 
private_data; } + static bool Parse(IMkvReader* reader, long long element_start, + long long element_size, Projection** projection); + + ProjectionType type; + unsigned char* private_data; + size_t private_data_length; + float pose_yaw; + float pose_pitch; + float pose_roll; +}; + class VideoTrack : public Track { VideoTrack(const VideoTrack&); VideoTrack& operator=(const VideoTrack&); @@ -497,6 +525,8 @@ class VideoTrack : public Track { Colour* GetColour() const; + Projection* GetProjection() const; + private: long long m_width; long long m_height; @@ -508,6 +538,7 @@ class VideoTrack : public Track { double m_rate; Colour* m_colour; + Projection* m_projection; }; class AudioTrack : public Track { @@ -813,6 +844,8 @@ class SeekHead { long Parse(); struct Entry { + Entry(); + // the SeekHead entry payload long long id; long long pos; diff --git a/libs/libvpx/third_party/libwebm/mkvparser/mkvreader.cc b/libs/libvpx/third_party/libwebm/mkvparser/mkvreader.cc index 9f90d8c4f8..23d68f5089 100644 --- a/libs/libvpx/third_party/libwebm/mkvparser/mkvreader.cc +++ b/libs/libvpx/third_party/libwebm/mkvparser/mkvreader.cc @@ -7,6 +7,8 @@ // be found in the AUTHORS file in the root of the source tree. #include "mkvparser/mkvreader.h" +#include + #include namespace mkvparser { @@ -117,7 +119,7 @@ int MkvReader::Read(long long offset, long len, unsigned char* buffer) { if (status) return -1; // error #else - fseek(m_file, offset, SEEK_SET); + fseeko(m_file, static_cast(offset), SEEK_SET); #endif const size_t size = fread(buffer, 1, len, m_file); @@ -128,4 +130,4 @@ int MkvReader::Read(long long offset, long len, unsigned char* buffer) { return 0; // success } -} // namespace mkvparser \ No newline at end of file +} // namespace mkvparser diff --git a/libs/libvpx/tools.mk b/libs/libvpx/tools.mk new file mode 100644 index 0000000000..1d005b2acf --- /dev/null +++ b/libs/libvpx/tools.mk @@ -0,0 +1,115 @@ +## +## Copyright (c) 2016 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +# List of tools to build. +TOOLS-yes += tiny_ssim.c +tiny_ssim.SRCS += vpx/vpx_integer.h y4minput.c y4minput.h \ + vpx/vpx_codec.h vpx/src/vpx_image.c +tiny_ssim.SRCS += vpx_mem/vpx_mem.c vpx_mem/vpx_mem.h +tiny_ssim.SRCS += vpx_dsp/ssim.h vpx_scale/yv12config.h +tiny_ssim.SRCS += vpx_ports/mem.h vpx_ports/mem.h +tiny_ssim.SRCS += vpx_mem/include/vpx_mem_intrnl.h +tiny_ssim.GUID = 3afa9b05-940b-4d68-b5aa-55157d8ed7b4 +tiny_ssim.DESCRIPTION = Generate SSIM/PSNR from raw .yuv files + +# +# End of specified files. The rest of the build rules should happen +# automagically from here. +# + + +# Expand list of selected tools to build (as specified above) +TOOLS = $(addprefix tools/,$(call enabled,TOOLS)) +ALL_SRCS = $(foreach ex,$(TOOLS),$($(notdir $(ex:.c=)).SRCS)) +CFLAGS += -I../include + +ifneq ($(CONFIG_CODEC_SRCS), yes) + CFLAGS += -I../include/vpx +endif + +# Expand all tools sources into a variable containing all sources +# for that tools (not just them main one specified in TOOLS) +# and add this file to the list (for MSVS workspace generation) +$(foreach ex,$(TOOLS),$(eval $(notdir $(ex:.c=)).SRCS += $(ex) tools.mk)) + + +# Create build/install dependencies for all tools. 
The common case +# is handled here. The MSVS case is handled below. +NOT_MSVS = $(if $(CONFIG_MSVS),,yes) +DIST-BINS-$(NOT_MSVS) += $(addprefix bin/,$(TOOLS:.c=$(EXE_SFX))) +DIST-SRCS-yes += $(ALL_SRCS) +OBJS-$(NOT_MSVS) += $(call objs,$(ALL_SRCS)) +BINS-$(NOT_MSVS) += $(addprefix $(BUILD_PFX),$(TOOLS:.c=$(EXE_SFX))) + +# Instantiate linker template for all tools. +$(foreach bin,$(BINS-yes),\ + $(eval $(bin):)\ + $(eval $(call linker_template,$(bin),\ + $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) -lm))) + +# The following pairs define a mapping of locations in the distribution +# tree to locations in the source/build trees. +INSTALL_MAPS += src/%.c %.c +INSTALL_MAPS += src/% $(SRC_PATH_BARE)/% +INSTALL_MAPS += bin/% % +INSTALL_MAPS += % % + + +# Build Visual Studio Projects. We use a template here to instantiate +# explicit rules rather than using an implicit rule because we want to +# leverage make's VPATH searching rather than specifying the paths on +# each file in TOOLS. This has the unfortunate side effect that +# touching the source files trigger a rebuild of the project files +# even though there is no real dependency there (the dependency is on +# the makefiles). We may want to revisit this. +define vcproj_template +$(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX) + $(if $(quiet),@echo " [vcproj] $$@") + $(qexec)$$(GEN_VCPROJ)\ + --exe\ + --target=$$(TOOLCHAIN)\ + --name=$$(@:.$(VCPROJ_SFX)=)\ + --ver=$$(CONFIG_VS_VERSION)\ + --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\ + --src-path-bare="$(SRC_PATH_BARE)" \ + $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \ + --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \ + $$(INTERNAL_LDFLAGS) $$(LDFLAGS) $$^ +endef +TOOLS_BASENAME := $(notdir $(TOOLS)) +PROJECTS-$(CONFIG_MSVS) += $(TOOLS_BASENAME:.c=.$(VCPROJ_SFX)) +INSTALL-BINS-$(CONFIG_MSVS) += $(foreach p,$(VS_PLATFORMS),\ + $(addprefix bin/$(p)/,$(TOOLS_BASENAME:.c=.exe))) +$(foreach proj,$(call enabled,PROJECTS),\ + $(eval $(call vcproj_template,$(proj)))) + +# +# Documentation Rules +# +%.dox: %.c + @echo " [DOXY] $@" + @mkdir -p $(dir $@) + @echo "/*!\page tools_$(@F:.dox=) $(@F:.dox=)" > $@ + @echo " \includelineno $(> $@ + @echo "*/" >> $@ + +tools.dox: tools.mk + @echo " [DOXY] $@" + @echo "/*!\page tools Tools" > $@ + @echo " This SDK includes a number of tools/utilities."\ + "The following tools are included: ">>$@ + @$(foreach ex,$(sort $(notdir $(TOOLS:.c=))),\ + echo " - \subpage tools_$(ex) $($(ex).DESCRIPTION)" >> $@;) + @echo "*/" >> $@ + +CLEAN-OBJS += tools.doxy tools.dox $(TOOLS:.c=.dox) +DOCS-yes += tools.doxy tools.dox +tools.doxy: tools.dox $(TOOLS:.c=.dox) + @echo "INPUT += $^" > $@ diff --git a/libs/libvpx/tools/all_builds.py b/libs/libvpx/tools/all_builds.py deleted file mode 100755 index d1f0c80c03..0000000000 --- a/libs/libvpx/tools/all_builds.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/python - -import getopt -import subprocess -import sys - -LONG_OPTIONS = ["shard=", "shards="] -BASE_COMMAND = "./configure --enable-internal-stats --enable-experimental" - -def RunCommand(command): - run = subprocess.Popen(command, shell=True) - output = run.communicate() - if run.returncode: - print "Non-zero return code: " + str(run.returncode) + " => exiting!" 
- sys.exit(1) - -def list_of_experiments(): - experiments = [] - configure_file = open("configure") - list_start = False - for line in configure_file.read().split("\n"): - if line == 'EXPERIMENT_LIST="': - list_start = True - elif line == '"': - list_start = False - elif list_start: - currently_broken = ["csm"] - experiment = line[4:] - if experiment not in currently_broken: - experiments.append(experiment) - return experiments - -def main(argv): - # Parse arguments - options = {"--shard": 0, "--shards": 1} - if "--" in argv: - opt_end_index = argv.index("--") - else: - opt_end_index = len(argv) - try: - o, _ = getopt.getopt(argv[1:opt_end_index], None, LONG_OPTIONS) - except getopt.GetoptError, err: - print str(err) - print "Usage: %s [--shard= --shards=] -- [configure flag ...]"%argv[0] - sys.exit(2) - - options.update(o) - extra_args = argv[opt_end_index + 1:] - - # Shard experiment list - shard = int(options["--shard"]) - shards = int(options["--shards"]) - experiments = list_of_experiments() - base_command = " ".join([BASE_COMMAND] + extra_args) - configs = [base_command] - configs += ["%s --enable-%s" % (base_command, e) for e in experiments] - my_configs = zip(configs, range(len(configs))) - my_configs = filter(lambda x: x[1] % shards == shard, my_configs) - my_configs = [e[0] for e in my_configs] - - # Run configs for this shard - for config in my_configs: - test_build(config) - -def test_build(configure_command): - print "\033[34m\033[47mTesting %s\033[0m" % (configure_command) - RunCommand(configure_command) - RunCommand("make clean") - RunCommand("make") - -if __name__ == "__main__": - main(sys.argv) diff --git a/libs/libvpx/tools/author_first_release.sh b/libs/libvpx/tools/author_first_release.sh deleted file mode 100755 index 7b0b797212..0000000000 --- a/libs/libvpx/tools/author_first_release.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -## -## List the release each author first contributed to. -## -## Usage: author_first_release.sh [TAGS] -## -## If the TAGS arguments are unspecified, all tags reported by `git tag` -## will be considered. -## -tags=${@:-$(git tag)} -for tag in $tags; do - git shortlog -n -e -s $tag | - cut -f2- | - awk "{print \"${tag#v}\t\"\$0}" -done | sort -k2 | uniq -f2 diff --git a/libs/libvpx/tools/ftfy.sh b/libs/libvpx/tools/ftfy.sh deleted file mode 100755 index c005918fe7..0000000000 --- a/libs/libvpx/tools/ftfy.sh +++ /dev/null @@ -1,158 +0,0 @@ -#!/bin/sh -self="$0" -dirname_self=$(dirname "$self") - -usage() { - cat <&2 -Usage: $self [option] - -This script applies a whitespace transformation to the commit at HEAD. If no -options are given, then the modified files are left in the working tree. - -Options: - -h, --help Shows this message - -n, --dry-run Shows a diff of the changes to be made. - --amend Squashes the changes into the commit at HEAD - This option will also reformat the commit message. - --commit Creates a new commit containing only the whitespace changes - --msg-only Reformat the commit message only, ignore the patch itself. 
- -EOF - rm -f ${CLEAN_FILES} - exit 1 -} - - -log() { - echo "${self##*/}: $@" >&2 -} - - -vpx_style() { - for f; do - case "$f" in - *.h|*.c|*.cc) - clang-format -i --style=file "$f" - ;; - esac - done -} - - -apply() { - [ $INTERSECT_RESULT -ne 0 ] && patch -p1 < "$1" -} - - -commit() { - LAST_CHANGEID=$(git show | awk '/Change-Id:/{print $2}') - if [ -z "$LAST_CHANGEID" ]; then - log "HEAD doesn't have a Change-Id, unable to generate a new commit" - exit 1 - fi - - # Build a deterministic Change-Id from the parent's - NEW_CHANGEID=${LAST_CHANGEID}-styled - NEW_CHANGEID=I$(echo $NEW_CHANGEID | git hash-object --stdin) - - # Commit, preserving authorship from the parent commit. - git commit -a -C HEAD > /dev/null - git commit --amend -F- << EOF -Cosmetic: Fix whitespace in change ${LAST_CHANGEID:0:9} - -Change-Id: ${NEW_CHANGEID} -EOF -} - - -show_commit_msg_diff() { - if [ $DIFF_MSG_RESULT -ne 0 ]; then - log "Modified commit message:" - diff -u "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG" | tail -n +3 - fi -} - - -amend() { - show_commit_msg_diff - if [ $DIFF_MSG_RESULT -ne 0 ] || [ $INTERSECT_RESULT -ne 0 ]; then - git commit -a --amend -F "$NEW_COMMIT_MSG" - fi -} - - -diff_msg() { - git log -1 --format=%B > "$ORIG_COMMIT_MSG" - "${dirname_self}"/wrap-commit-msg.py \ - < "$ORIG_COMMIT_MSG" > "$NEW_COMMIT_MSG" - cmp -s "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG" - DIFF_MSG_RESULT=$? -} - - -# Temporary files -ORIG_DIFF=orig.diff.$$ -MODIFIED_DIFF=modified.diff.$$ -FINAL_DIFF=final.diff.$$ -ORIG_COMMIT_MSG=orig.commit-msg.$$ -NEW_COMMIT_MSG=new.commit-msg.$$ -CLEAN_FILES="${ORIG_DIFF} ${MODIFIED_DIFF} ${FINAL_DIFF}" -CLEAN_FILES="${CLEAN_FILES} ${ORIG_COMMIT_MSG} ${NEW_COMMIT_MSG}" - -# Preconditions -[ $# -lt 2 ] || usage - -if ! clang-format -version >/dev/null 2>&1; then - log "clang-format not found" - exit 1 -fi - -if ! git diff --quiet HEAD; then - log "Working tree is dirty, commit your changes first" - exit 1 -fi - -# Need to be in the root -cd "$(git rev-parse --show-toplevel)" - -# Collect the original diff -git show > "${ORIG_DIFF}" - -# Apply the style guide on new and modified files and collect its diff -for f in $(git diff HEAD^ --name-only -M90 --diff-filter=AM); do - case "$f" in - third_party/*) continue;; - esac - vpx_style "$f" -done -git diff --no-color --no-ext-diff > "${MODIFIED_DIFF}" - -# Intersect the two diffs -"${dirname_self}"/intersect-diffs.py \ - "${ORIG_DIFF}" "${MODIFIED_DIFF}" > "${FINAL_DIFF}" -INTERSECT_RESULT=$? -git reset --hard >/dev/null - -# Fixup the commit message -diff_msg - -# Handle options -if [ -n "$1" ]; then - case "$1" in - -h|--help) usage;; - -n|--dry-run) cat "${FINAL_DIFF}"; show_commit_msg_diff;; - --commit) apply "${FINAL_DIFF}"; commit;; - --amend) apply "${FINAL_DIFF}"; amend;; - --msg-only) amend;; - *) usage;; - esac -else - apply "${FINAL_DIFF}" - if ! git diff --quiet; then - log "Formatting changes applied, verify and commit." - log "See also: http://www.webmproject.org/code/contribute/conventions/" - git diff --stat - fi -fi - -rm -f ${CLEAN_FILES} diff --git a/libs/libvpx/tools/gen_authors.sh b/libs/libvpx/tools/gen_authors.sh index 4cfd81ec3f..f163f663af 100755 --- a/libs/libvpx/tools/gen_authors.sh +++ b/libs/libvpx/tools/gen_authors.sh @@ -6,7 +6,8 @@ cat <" | sort | uniq | grep -v corp.google) +$(git log --pretty=format:"%aN <%aE>" | sort | uniq | grep -v corp.google \ + | grep -v noreply) Google Inc. 
The Mozilla Foundation The Xiph.Org Foundation diff --git a/libs/libvpx/tools/tiny_ssim.c b/libs/libvpx/tools/tiny_ssim.c new file mode 100644 index 0000000000..5e8ca02b49 --- /dev/null +++ b/libs/libvpx/tools/tiny_ssim.c @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include +#include +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "./y4minput.h" +#include "vpx_dsp/ssim.h" +#include "vpx_ports/mem.h" + +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 +static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2 +static const int64_t cc2_10 = 3857925; // (64^2*(.03*1023)^2 +static const int64_t cc1_12 = 6868593; // (64^2*(.01*4095)^2 +static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2 + +#if CONFIG_VP9_HIGHBITDEPTH +static uint64_t calc_plane_error16(uint16_t *orig, int orig_stride, + uint16_t *recon, int recon_stride, + unsigned int cols, unsigned int rows) { + unsigned int row, col; + uint64_t total_sse = 0; + int diff; + + for (row = 0; row < rows; row++) { + for (col = 0; col < cols; col++) { + diff = orig[col] - recon[col]; + total_sse += diff * diff; + } + + orig += orig_stride; + recon += recon_stride; + } + return total_sse; +} +#endif +static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon, + int recon_stride, unsigned int cols, + unsigned int rows) { + unsigned int row, col; + uint64_t total_sse = 0; + int diff; + + for (row = 0; row < rows; row++) { + for (col = 0; col < cols; col++) { + diff = orig[col] - recon[col]; + total_sse += diff * diff; + } + + orig += orig_stride; + recon += recon_stride; + } + return total_sse; +} + +#define MAX_PSNR 100 +static double mse2psnr(double samples, double peak, double mse) { + double psnr; + + if (mse > 0.0) + psnr = 10.0 * log10(peak * peak * samples / mse); + else + psnr = MAX_PSNR; // Limit to prevent / 0 + + if (psnr > MAX_PSNR) psnr = MAX_PSNR; + + return psnr; +} + +typedef enum { RAW_YUV, Y4M } input_file_type; + +typedef struct input_file { + FILE *file; + input_file_type type; + unsigned char *buf; + y4m_input y4m; + vpx_image_t img; + int w; + int h; + int bit_depth; +} input_file_t; + +// Open a file and determine if its y4m or raw. If y4m get the header. +static int open_input_file(const char *file_name, input_file_t *input, int w, + int h, int bit_depth) { + char y4m_buf[4]; + size_t r1; + input->type = RAW_YUV; + input->buf = NULL; + input->file = strcmp(file_name, "-") ? fopen(file_name, "rb") : stdin; + if (input->file == NULL) return -1; + r1 = fread(y4m_buf, 1, 4, input->file); + if (r1 == 4) { + if (memcmp(y4m_buf, "YUV4", 4) == 0) input->type = Y4M; + switch (input->type) { + case Y4M: + y4m_input_open(&input->y4m, input->file, y4m_buf, 4, 0); + input->w = input->y4m.pic_w; + input->h = input->y4m.pic_h; + input->bit_depth = input->y4m.bit_depth; + // Y4M alloc's its own buf. Init this to avoid problems if we never + // read frames. 
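Editor's aside on mse2psnr() defined earlier in tiny_ssim.c: its third argument is the total sum of squared errors over the plane, not the per-sample MSE, so peak*peak*samples/sse reduces to peak^2/MSE. A worked sketch (hypothetical numbers, names are mine):

#include <math.h>
#include <stdio.h>
/* Sketch of the same computation, capped at 100 dB as in mse2psnr(). */
static double psnr_sketch(double samples, double peak, double sse) {
  double psnr = (sse > 0.0) ? 10.0 * log10(peak * peak * samples / sse) : 100.0;
  return psnr > 100.0 ? 100.0 : psnr;
}

int main(void) {
  /* A 1920x1080 8-bit luma plane with an average squared error of 1
   * per sample gives 10*log10(255^2), roughly 48.13 dB. */
  const double samples = 1920.0 * 1080.0;
  printf("%.2f dB\n", psnr_sketch(samples, 255.0, samples * 1.0));
  return 0;
}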
+ memset(&input->img, 0, sizeof(input->img)); + break; + case RAW_YUV: + fseek(input->file, 0, SEEK_SET); + input->w = w; + input->h = h; + if (bit_depth < 9) + input->buf = malloc(w * h * 3 / 2); + else + input->buf = malloc(w * h * 3); + break; + } + } + return 0; +} + +static void close_input_file(input_file_t *in) { + if (in->file) fclose(in->file); + if (in->type == Y4M) { + vpx_img_free(&in->img); + } else { + free(in->buf); + } +} + +static size_t read_input_file(input_file_t *in, unsigned char **y, + unsigned char **u, unsigned char **v, int bd) { + size_t r1 = 0; + switch (in->type) { + case Y4M: + r1 = y4m_input_fetch_frame(&in->y4m, in->file, &in->img); + *y = in->img.planes[0]; + *u = in->img.planes[1]; + *v = in->img.planes[2]; + break; + case RAW_YUV: + if (bd < 9) { + r1 = fread(in->buf, in->w * in->h * 3 / 2, 1, in->file); + *y = in->buf; + *u = in->buf + in->w * in->h; + *v = in->buf + 5 * in->w * in->h / 4; + } else { + r1 = fread(in->buf, in->w * in->h * 3, 1, in->file); + *y = in->buf; + *u = in->buf + in->w * in->h / 2; + *v = *u + in->w * in->h / 2; + } + break; + } + + return r1; +} + +void ssim_parms_16x16(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 16; i++, s += sp, r += rp) { + for (j = 0; j < 16; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} +void ssim_parms_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +void highbd_ssim_parms_8x8(const uint16_t *s, int sp, const uint16_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, + uint32_t sum_sq_r, uint32_t sum_sxr, int count, + uint32_t bd) { + int64_t ssim_n, ssim_d; + int64_t c1 = 0, c2 = 0; + if (bd == 8) { + // scale the constants by number of pixels + c1 = (cc1 * count * count) >> 12; + c2 = (cc2 * count * count) >> 12; + } else if (bd == 10) { + c1 = (cc1_10 * count * count) >> 12; + c2 = (cc2_10 * count * count) >> 12; + } else if (bd == 12) { + c1 = (cc1_12 * count * count) >> 12; + c2 = (cc2_12 * count * count) >> 12; + } else { + assert(0); + } + + ssim_n = (2 * sum_s * sum_r + c1) * + ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); + + ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * + ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + + (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); + + return ssim_n * 1.0 / ssim_d; +} + +static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8); +} + +static double highbd_ssim_8x8(const uint16_t *s, int sp, 
const uint16_t *r, + int rp, uint32_t bd, uint32_t shift) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift), + sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); +} + +// We are using a 8x8 moving window with starting location of each 8x8 window +// on the 4x4 pixel grid. Such arrangement allows the windows to overlap +// block boundaries to penalize blocking artifacts. +static double ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, + int stride_img2, int width, int height) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +static double highbd_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, + int height, uint32_t bd, uint32_t shift) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, + CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, + shift); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity +// +// Re working out the math -> +// +// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) / +// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2)) +// +// mean(x) = sum(x) / n +// +// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n) +// +// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n) +// +// ssim(x,y) = +// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) / +// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) * +// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+ +// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2))) +// +// factoring out n*n +// +// ssim(x,y) = +// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) / +// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) * +// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2)) +// +// Replace c1 with n*n * c1 for the final step that leads to this code: +// The final step scales by 12 bits so we don't lose precision in the constants. + +static double ssimv_similarity(const Ssimv *sv, int64_t n) { + // Scale the constants by number of pixels. + const int64_t c1 = (cc1 * n * n) >> 12; + const int64_t c2 = (cc2 * n * n) >> 12; + + const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) / + (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1); + + // Since these variables are unsigned sums, convert to double so + // math is done in double arithmetic. + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); + + return l * v; +} + +// The first term of the ssim metric is a luminance factor. 
+// +// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1) +// +// This luminance factor is super sensitive to the dark side of luminance +// values and completely insensitive on the white side. check out 2 sets +// (1,3) and (250,252) the term gives ( 2*1*3/(1+9) = .60 +// 2*250*252/ (250^2+252^2) => .99999997 +// +// As a result in this tweaked version of the calculation in which the +// luminance is taken as percentage off from peak possible. +// +// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count +// +static double ssimv_similarity2(const Ssimv *sv, int64_t n) { + // Scale the constants by number of pixels. + const int64_t c1 = (cc1 * n * n) >> 12; + const int64_t c2 = (cc2 * n * n) >> 12; + + const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n; + const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1); + + // Since these variables are unsigned, sums convert to double so + // math is done in double arithmetic. + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); + + return l * v; +} +static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, Ssimv *sv) { + ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r, + &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr); +} + +double get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, int width, int height, Ssimv *sv2, + Metrics *m, int do_inconsistency) { + double dssim_total = 0; + double ssim_total = 0; + double ssim2_total = 0; + double inconsistency_total = 0; + int i, j; + int c = 0; + double norm; + double old_ssim_total = 0; + + // We can sample points as frequently as we like start with 1 per 4x4. + for (i = 0; i < height; + i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { + for (j = 0; j < width; j += 4, ++c) { + Ssimv sv = { 0, 0, 0, 0, 0, 0 }; + double ssim; + double ssim2; + double dssim; + uint32_t var_new; + uint32_t var_old; + uint32_t mean_new; + uint32_t mean_old; + double ssim_new; + double ssim_old; + + // Not sure there's a great way to handle the edge pixels + // in ssim when using a window. Seems biased against edge pixels + // however you handle this. This uses only samples that are + // fully in the frame. + if (j + 8 <= width && i + 8 <= height) { + ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv); + } + + ssim = ssimv_similarity(&sv, 64); + ssim2 = ssimv_similarity2(&sv, 64); + + sv.ssim = ssim2; + + // dssim is calculated to use as an actual error metric and + // is scaled up to the same range as sum square error. + // Since we are subsampling every 16th point maybe this should be + // *16 ? + dssim = 255 * 255 * (1 - ssim2) / 2; + + // Here I introduce a new error metric: consistency-weighted + // SSIM-inconsistency. This metric isolates frames where the + // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much + // sharper or blurrier than the others. Higher values indicate a + // temporally inconsistent SSIM. There are two ideas at work: + // + // 1) 'SSIM-inconsistency': the total inconsistency value + // reflects how much SSIM values are changing between this + // source / reference frame pair and the previous pair. + // + // 2) 'consistency-weighted': weights de-emphasize areas in the + // frame where the scene content has changed. Changes in scene + // content are detected via changes in local variance and local + // mean. 
+ // + // Thus the overall measure reflects how inconsistent the SSIM + // values are, over consistent regions of the frame. + // + // The metric has three terms: + // + // term 1 -> uses change in scene Variance to weight error score + // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2) + // larger changes from one frame to the next mean we care + // less about consistency. + // + // term 2 -> uses change in local scene luminance to weight error + // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2) + // larger changes from one frame to the next mean we care + // less about consistency. + // + // term3 -> measures inconsistency in ssim scores between frames + // 1 - ( 2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+sssim(Fi-1)^2). + // + // This term compares the ssim score for the same location in 2 + // subsequent frames. + var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64; + var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64; + mean_new = sv.sum_s; + mean_old = sv2[c].sum_s; + ssim_new = sv.ssim; + ssim_old = sv2[c].ssim; + + if (do_inconsistency) { + // We do the metric once for every 4x4 block in the image. Since + // we are scaling the error to SSE for use in a psnr calculation + // 1.0 = 4x4x255x255 the worst error we can possibly have. + static const double kScaling = 4. * 4 * 255 * 255; + + // The constants have to be non 0 to avoid potential divide by 0 + // issues other than that they affect kind of a weighting between + // the terms. No testing of what the right terms should be has been + // done. + static const double c1 = 1, c2 = 1, c3 = 1; + + // This measures how much consistent variance is in two consecutive + // source frames. 1.0 means they have exactly the same variance. + const double variance_term = + (2.0 * var_old * var_new + c1) / + (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1); + + // This measures how consistent the local mean are between two + // consecutive frames. 1.0 means they have exactly the same mean. + const double mean_term = + (2.0 * mean_old * mean_new + c2) / + (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2); + + // This measures how consistent the ssims of two + // consecutive frames is. 1.0 means they are exactly the same. + double ssim_term = + pow((2.0 * ssim_old * ssim_new + c3) / + (ssim_old * ssim_old + ssim_new * ssim_new + c3), + 5); + + double this_inconsistency; + + // Floating point math sometimes makes this > 1 by a tiny bit. + // We want the metric to scale between 0 and 1.0 so we can convert + // it to an snr scaled value. + if (ssim_term > 1) ssim_term = 1; + + // This converts the consistency metric to an inconsistency metric + // ( so we can scale it like psnr to something like sum square error. + // The reason for the variance and mean terms is the assumption that + // if there are big changes in the source we shouldn't penalize + // inconsistency in ssim scores a bit less as it will be less visible + // to the user. + this_inconsistency = (1 - ssim_term) * variance_term * mean_term; + + this_inconsistency *= kScaling; + inconsistency_total += this_inconsistency; + } + sv2[c] = sv; + ssim_total += ssim; + ssim2_total += ssim2; + dssim_total += dssim; + + old_ssim_total += ssim_old; + } + old_ssim_total += 0; + } + + norm = 1. 
/ (width / 4) / (height / 4); + ssim_total *= norm; + ssim2_total *= norm; + m->ssim2 = ssim2_total; + m->ssim = ssim_total; + if (old_ssim_total == 0) inconsistency_total = 0; + + m->ssimc = inconsistency_total; + + m->dssim = dssim_total; + return inconsistency_total; +} + +double highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, + uint32_t bd, uint32_t in_bd) { + double a, b, c; + double ssimv; + uint32_t shift = 0; + + assert(bd >= in_bd); + shift = bd - in_bd; + + a = highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, + dest->y_stride, source->y_crop_width, source->y_crop_height, + in_bd, shift); + + b = highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, shift); + + c = highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, shift); + + ssimv = a * .8 + .1 * (b + c); + + *weight = 1; + + return ssimv; +} + +int main(int argc, char *argv[]) { + FILE *framestats = NULL; + int bit_depth = 8; + int w = 0, h = 0, tl_skip = 0, tl_skips_remaining = 0; + double ssimavg = 0, ssimyavg = 0, ssimuavg = 0, ssimvavg = 0; + double psnrglb = 0, psnryglb = 0, psnruglb = 0, psnrvglb = 0; + double psnravg = 0, psnryavg = 0, psnruavg = 0, psnrvavg = 0; + double *ssimy = NULL, *ssimu = NULL, *ssimv = NULL; + uint64_t *psnry = NULL, *psnru = NULL, *psnrv = NULL; + size_t i, n_frames = 0, allocated_frames = 0; + int return_value = 0; + input_file_t in[2]; + double peak = 255.0; + + if (argc < 2) { + fprintf(stderr, + "Usage: %s file1.{yuv|y4m} file2.{yuv|y4m}" + "[WxH tl_skip={0,1,3} frame_stats_file bits]\n", + argv[0]); + return_value = 1; + goto clean_up; + } + + if (argc > 3) { + sscanf(argv[3], "%dx%d", &w, &h); + } + + if (argc > 6) { + sscanf(argv[6], "%d", &bit_depth); + } + + if (open_input_file(argv[1], &in[0], w, h, bit_depth) < 0) { + fprintf(stderr, "File %s can't be opened or parsed!\n", argv[2]); + goto clean_up; + } + + if (w == 0 && h == 0) { + // If a y4m is the first file and w, h is not set grab from first file. + w = in[0].w; + h = in[0].h; + bit_depth = in[0].bit_depth; + } + if (bit_depth == 10) peak = 1023.0; + + if (bit_depth == 12) peak = 4095; + + if (open_input_file(argv[2], &in[1], w, h, bit_depth) < 0) { + fprintf(stderr, "File %s can't be opened or parsed!\n", argv[2]); + goto clean_up; + } + + if (in[0].w != in[1].w || in[0].h != in[1].h || in[0].w != w || + in[0].h != h || w == 0 || h == 0) { + fprintf(stderr, + "Failing: Image dimensions don't match or are unspecified!\n"); + return_value = 1; + goto clean_up; + } + + // Number of frames to skip from file1.yuv for every frame used. Normal values + // 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL encoding + // in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer encoding. 
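Editor's aside: the tl_skip handling that follows keeps one frame of file1 and then discards the next tl_skip frames, so frames 0, tl_skip+1, 2*(tl_skip+1), ... are the ones compared against file2. A standalone sketch of that selection (illustrative only; names are mine):

#include <stdio.h>
/* Sketch: which source-frame indices survive a given tl_skip. */
int main(void) {
  const int tl_skip = 3;  /* e.g. comparing TL0 of a 3-temporal-layer stream */
  int frame, remaining = 0;
  for (frame = 0; frame < 12; ++frame) {
    if (remaining > 0) { --remaining; continue; }  /* skipped frame */
    remaining = tl_skip;
    printf("compare frame %d\n", frame);           /* prints 0, 4, 8 */
  }
  return 0;
}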
+ if (argc > 4) { + sscanf(argv[4], "%d", &tl_skip); + if (argc > 5) { + framestats = fopen(argv[5], "w"); + if (!framestats) { + fprintf(stderr, "Could not open \"%s\" for writing: %s\n", argv[5], + strerror(errno)); + return_value = 1; + goto clean_up; + } + } + } + + if (w & 1 || h & 1) { + fprintf(stderr, "Invalid size %dx%d\n", w, h); + return_value = 1; + goto clean_up; + } + + while (1) { + size_t r1, r2; + unsigned char *y[2], *u[2], *v[2]; + + r1 = read_input_file(&in[0], &y[0], &u[0], &v[0], bit_depth); + + if (r1) { + // Reading parts of file1.yuv that were not used in temporal layer. + if (tl_skips_remaining > 0) { + --tl_skips_remaining; + continue; + } + // Use frame, but skip |tl_skip| after it. + tl_skips_remaining = tl_skip; + } + + r2 = read_input_file(&in[1], &y[1], &u[1], &v[1], bit_depth); + + if (r1 && r2 && r1 != r2) { + fprintf(stderr, "Failed to read data: %s [%d/%d]\n", strerror(errno), + (int)r1, (int)r2); + return_value = 1; + goto clean_up; + } else if (r1 == 0 || r2 == 0) { + break; + } +#if CONFIG_VP9_HIGHBITDEPTH +#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ + if (bit_depth < 9) { \ + ssim = ssim2(buf0, buf1, w, w, w, h); \ + psnr = calc_plane_error(buf0, w, buf1, w, w, h); \ + } else { \ + ssim = highbd_ssim2(CONVERT_TO_BYTEPTR(buf0), CONVERT_TO_BYTEPTR(buf1), w, \ + w, w, h, bit_depth, bit_depth - 8); \ + psnr = calc_plane_error16(CAST_TO_SHORTPTR(buf0), w, \ + CAST_TO_SHORTPTR(buf1), w, w, h); \ + } +#else +#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ + ssim = ssim2(buf0, buf1, w, w, w, h); \ + psnr = calc_plane_error(buf0, w, buf1, w, w, h); +#endif + + if (n_frames == allocated_frames) { + allocated_frames = allocated_frames == 0 ? 1024 : allocated_frames * 2; + ssimy = realloc(ssimy, allocated_frames * sizeof(*ssimy)); + ssimu = realloc(ssimu, allocated_frames * sizeof(*ssimu)); + ssimv = realloc(ssimv, allocated_frames * sizeof(*ssimv)); + psnry = realloc(psnry, allocated_frames * sizeof(*psnry)); + psnru = realloc(psnru, allocated_frames * sizeof(*psnru)); + psnrv = realloc(psnrv, allocated_frames * sizeof(*psnrv)); + } + psnr_and_ssim(ssimy[n_frames], psnry[n_frames], y[0], y[1], w, h); + psnr_and_ssim(ssimu[n_frames], psnru[n_frames], u[0], u[1], w / 2, h / 2); + psnr_and_ssim(ssimv[n_frames], psnrv[n_frames], v[0], v[1], w / 2, h / 2); + + n_frames++; + } + + if (framestats) { + fprintf(framestats, + "ssim,ssim-y,ssim-u,ssim-v,psnr,psnr-y,psnr-u,psnr-v\n"); + } + + for (i = 0; i < n_frames; ++i) { + double frame_ssim; + double frame_psnr, frame_psnry, frame_psnru, frame_psnrv; + + frame_ssim = 0.8 * ssimy[i] + 0.1 * (ssimu[i] + ssimv[i]); + ssimavg += frame_ssim; + ssimyavg += ssimy[i]; + ssimuavg += ssimu[i]; + ssimvavg += ssimv[i]; + + frame_psnr = + mse2psnr(w * h * 6 / 4, peak, (double)psnry[i] + psnru[i] + psnrv[i]); + frame_psnry = mse2psnr(w * h * 4 / 4, peak, (double)psnry[i]); + frame_psnru = mse2psnr(w * h * 1 / 4, peak, (double)psnru[i]); + frame_psnrv = mse2psnr(w * h * 1 / 4, peak, (double)psnrv[i]); + + psnravg += frame_psnr; + psnryavg += frame_psnry; + psnruavg += frame_psnru; + psnrvavg += frame_psnrv; + + psnryglb += psnry[i]; + psnruglb += psnru[i]; + psnrvglb += psnrv[i]; + + if (framestats) { + fprintf(framestats, "%lf,%lf,%lf,%lf,%lf,%lf,%lf,%lf\n", frame_ssim, + ssimy[i], ssimu[i], ssimv[i], frame_psnr, frame_psnry, + frame_psnru, frame_psnrv); + } + } + + ssimavg /= n_frames; + ssimyavg /= n_frames; + ssimuavg /= n_frames; + ssimvavg /= n_frames; + + printf("VpxSSIM: %lf\n", 100 * pow(ssimavg, 8.0)); + 
printf("SSIM: %lf\n", ssimavg); + printf("SSIM-Y: %lf\n", ssimyavg); + printf("SSIM-U: %lf\n", ssimuavg); + printf("SSIM-V: %lf\n", ssimvavg); + puts(""); + + psnravg /= n_frames; + psnryavg /= n_frames; + psnruavg /= n_frames; + psnrvavg /= n_frames; + + printf("AvgPSNR: %lf\n", psnravg); + printf("AvgPSNR-Y: %lf\n", psnryavg); + printf("AvgPSNR-U: %lf\n", psnruavg); + printf("AvgPSNR-V: %lf\n", psnrvavg); + puts(""); + + psnrglb = psnryglb + psnruglb + psnrvglb; + psnrglb = mse2psnr((double)n_frames * w * h * 6 / 4, peak, psnrglb); + psnryglb = mse2psnr((double)n_frames * w * h * 4 / 4, peak, psnryglb); + psnruglb = mse2psnr((double)n_frames * w * h * 1 / 4, peak, psnruglb); + psnrvglb = mse2psnr((double)n_frames * w * h * 1 / 4, peak, psnrvglb); + + printf("GlbPSNR: %lf\n", psnrglb); + printf("GlbPSNR-Y: %lf\n", psnryglb); + printf("GlbPSNR-U: %lf\n", psnruglb); + printf("GlbPSNR-V: %lf\n", psnrvglb); + puts(""); + + printf("Nframes: %d\n", (int)n_frames); + +clean_up: + + close_input_file(&in[0]); + close_input_file(&in[1]); + + if (framestats) fclose(framestats); + + free(ssimy); + free(ssimu); + free(ssimv); + + free(psnry); + free(psnru); + free(psnrv); + + return return_value; +} diff --git a/libs/libvpx/tools_common.h b/libs/libvpx/tools_common.h index 73ba1bc03b..e41de3195f 100644 --- a/libs/libvpx/tools_common.h +++ b/libs/libvpx/tools_common.h @@ -26,11 +26,21 @@ /* MSVS uses _f{seek,tell}i64. */ #define fseeko _fseeki64 #define ftello _ftelli64 +typedef int64_t FileOffset; #elif defined(_WIN32) /* MinGW uses f{seek,tell}o64 for large files. */ #define fseeko fseeko64 #define ftello ftello64 -#endif /* _WIN32 */ +typedef off64_t FileOffset; +#elif CONFIG_OS_SUPPORT +typedef off_t FileOffset; +/* Use 32-bit file operations in WebM file format when building ARM + * executables (.axf) with RVCT. */ +#else +#define fseeko fseek +#define ftello ftell +typedef long FileOffset; /* NOLINT */ +#endif /* CONFIG_OS_SUPPORT */ #if CONFIG_OS_SUPPORT #if defined(_MSC_VER) @@ -42,13 +52,6 @@ #endif /* _MSC_VER */ #endif /* CONFIG_OS_SUPPORT */ -/* Use 32-bit file operations in WebM file format when building ARM - * executables (.axf) with RVCT. 
*/ -#if !CONFIG_OS_SUPPORT -#define fseeko fseek -#define ftello ftell -#endif /* CONFIG_OS_SUPPORT */ - #define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo) #ifndef PATH_MAX diff --git a/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c index af566c2c41..8520ab5ca0 100644 --- a/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c @@ -11,6 +11,7 @@ #include #include #include "./vpx_config.h" +#include "vpx_dsp/arm/mem_neon.h" static const uint8_t bifilter4_coeff[8][2] = { { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, @@ -21,35 +22,6 @@ static INLINE uint8x8_t load_and_shift(const unsigned char *a) { return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32)); } -static INLINE void store4x4(unsigned char *dst, int dst_stride, - const uint8x8_t a0, const uint8x8_t a1) { - if (!((uintptr_t)dst & 0x3) && !(dst_stride & 0x3)) { - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 1); - } else { - // Store to the aligned local buffer and memcpy instead of vget_lane_u8 - // which is really really slow. - uint32_t output_buffer[4]; - vst1_lane_u32(output_buffer, vreinterpret_u32_u8(a0), 0); - vst1_lane_u32(output_buffer + 1, vreinterpret_u32_u8(a0), 1); - vst1_lane_u32(output_buffer + 2, vreinterpret_u32_u8(a1), 0); - vst1_lane_u32(output_buffer + 3, vreinterpret_u32_u8(a1), 1); - - memcpy(dst, output_buffer, 4); - dst += dst_stride; - memcpy(dst, output_buffer + 1, 4); - dst += dst_stride; - memcpy(dst, output_buffer + 2, 4); - dst += dst_stride; - memcpy(dst, output_buffer + 3, 4); - } -} - void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, @@ -122,7 +94,7 @@ void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, // secondpass_filter if (yoffset == 0) { // skip_2ndpass_filter - store4x4(dst_ptr, dst_pitch, e0, e1); + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1)); } else { uint8x8_t f0, f1; const uint8x8_t filter0 = vdup_n_u8(bifilter4_coeff[yoffset][0]); @@ -140,7 +112,7 @@ void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, f0 = vqrshrn_n_u16(b0, 7); f1 = vqrshrn_n_u16(b1, 7); - store4x4(dst_ptr, dst_pitch, f0, f1); + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(f0, f1)); } } diff --git a/libs/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c b/libs/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c index d3a53db21f..d12c3a8392 100644 --- a/libs/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c @@ -10,6 +10,8 @@ #include +#include "./vp8_rtcd.h" + void vp8_dc_only_idct_add_neon(int16_t input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride) { diff --git a/libs/libvpx/vp8/common/arm/neon/dequant_idct_neon.c b/libs/libvpx/vp8/common/arm/neon/dequant_idct_neon.c index ff5981eaac..5445f2965a 100644 --- a/libs/libvpx/vp8/common/arm/neon/dequant_idct_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/dequant_idct_neon.c @@ -10,8 +10,14 @@ #include +#include "./vp8_rtcd.h" + static const int16_t cospi8sqrt2minus1 = 20091; -static const int16_t sinpi8sqrt2 = 35468; +// 35468 exceeds INT16_MAX and gets 
converted to a negative number. Because of +// the way it is used in vqdmulh, where the result is doubled, it can be divided +// by 2 beforehand. This saves compensating for the negative value as well as +// shifting the result. +static const int16_t sinpi8sqrt2 = 35468 >> 1; void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst, int stride) { @@ -60,10 +66,8 @@ void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst, q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); - q3 = vshrq_n_s16(q3, 1); q4 = vshrq_n_s16(q4, 1); - q3 = vqaddq_s16(q3, q2); q4 = vqaddq_s16(q4, q2); d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); @@ -90,10 +94,8 @@ void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst, d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]); d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]); - q3 = vshrq_n_s16(q3, 1); q4 = vshrq_n_s16(q4, 1); - q3 = vqaddq_s16(q3, q2); q4 = vqaddq_s16(q4, q2); d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); diff --git a/libs/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c b/libs/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c index a36c0c1cac..2724ca236b 100644 --- a/libs/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c @@ -10,8 +10,14 @@ #include +#include "./vp8_rtcd.h" + static const int16_t cospi8sqrt2minus1 = 20091; -static const int16_t sinpi8sqrt2 = 35468; +// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of +// the way it is used in vqdmulh, where the result is doubled, it can be divided +// by 2 beforehand. This saves compensating for the negative value as well as +// shifting the result. +static const int16_t sinpi8sqrt2 = 35468 >> 1; void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, @@ -40,10 +46,8 @@ void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr, d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 - q3s16 = vshrq_n_s16(q3s16, 1); q4s16 = vshrq_n_s16(q4s16, 1); - q3s16 = vqaddq_s16(q3s16, q2s16); q4s16 = vqaddq_s16(q4s16, q2s16); d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 @@ -71,10 +75,8 @@ void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr, d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 - q3s16 = vshrq_n_s16(q3s16, 1); q4s16 = vshrq_n_s16(q4s16, 1); - q3s16 = vqaddq_s16(q3s16, q2s16); q4s16 = vqaddq_s16(q4s16, q2s16); d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 diff --git a/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c index 622baa3c50..aa2567df79 100644 --- a/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c @@ -9,6 +9,9 @@ */ #include +#include +#include "./vpx_config.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_ports/mem.h" static const int8_t vp8_sub_pel_filters[8][8] = { @@ -22,6 +25,367 @@ static const int8_t vp8_sub_pel_filters[8][8] = { { 0, -1, 12, 123, -6, 0, 0, 0 }, }; +// This table is derived from vp8/common/filter.c:vp8_sub_pel_filters. +// Apply abs() to all the values. Elements 0, 2, 3, and 5 are always positive. +// Elements 1 and 4 are either 0 or negative. 
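For reference, a minimal plain-C sketch of the 6-tap filtering these routines implement with the signed taps from vp8_sub_pel_filters; the helper name is illustrative only and not part of this change:

static unsigned char sixtap_pixel(const unsigned char *src, int pitch,
                                  const int8_t taps[6]) {
  int k, sum = 0;
  // The six taps cover src[-2 * pitch] .. src[3 * pitch].
  for (k = 0; k < 6; ++k) sum += taps[k] * src[(k - 2) * pitch];
  sum = (sum + 64) >> 7;  // same rounding as the vqrshrun_n_s16(..., 7) calls below
  return (unsigned char)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}
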
The code accounts for this with +// multiply/accumulates which either add or subtract as needed. The other +// functions will be updated to use this table later. +// It is also expanded to 8 elements to allow loading into 64 bit neon +// registers. +static const uint8_t abs_filters[8][8] = { + { 0, 0, 128, 0, 0, 0, 0, 0 }, { 0, 6, 123, 12, 1, 0, 0, 0 }, + { 2, 11, 108, 36, 8, 1, 0, 0 }, { 0, 9, 93, 50, 6, 0, 0, 0 }, + { 3, 16, 77, 77, 16, 3, 0, 0 }, { 0, 6, 50, 93, 9, 0, 0, 0 }, + { 1, 8, 36, 108, 11, 2, 0, 0 }, { 0, 1, 12, 123, 6, 0, 0, 0 }, +}; + +static INLINE uint8x8_t load_and_shift(const unsigned char *a) { + return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32)); +} + +static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b, + const uint8x8_t filter, uint16x8_t *c, + uint16x8_t *d) { + const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)), + vreinterpret_u32_u8(vget_high_u8(a))); + const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)), + vreinterpret_u32_u8(vget_high_u8(b))); + *c = vmlal_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter); + *d = vmlal_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter); +} + +static INLINE void filter_sub_accumulate(const uint8x16_t a, const uint8x16_t b, + const uint8x8_t filter, uint16x8_t *c, + uint16x8_t *d) { + const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)), + vreinterpret_u32_u8(vget_high_u8(a))); + const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)), + vreinterpret_u32_u8(vget_high_u8(b))); + *c = vmlsl_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter); + *d = vmlsl_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter); +} + +static INLINE void yonly4x4(const unsigned char *src, int src_stride, + int filter_offset, unsigned char *dst, + int dst_stride) { + uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8; + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8; + uint16x8_t c0, c1, c2, c3; + int16x8_t d0, d1; + uint8x8_t e0, e1; + + const uint8x8_t filter = vld1_u8(abs_filters[filter_offset]); + const uint8x8_t filter0 = vdup_lane_u8(filter, 0); + const uint8x8_t filter1 = vdup_lane_u8(filter, 1); + const uint8x8_t filter2 = vdup_lane_u8(filter, 2); + const uint8x8_t filter3 = vdup_lane_u8(filter, 3); + const uint8x8_t filter4 = vdup_lane_u8(filter, 4); + const uint8x8_t filter5 = vdup_lane_u8(filter, 5); + + src -= src_stride * 2; + // Shift the even rows to allow using 'vext' to combine the vectors. armv8 + // has vcopy_lane which would be interesting. This started as just a + // horrible workaround for clang adding alignment hints to 32bit loads: + // https://llvm.org/bugs/show_bug.cgi?id=24421 + // But it turns out it almost identical to casting the loads. + a0 = load_and_shift(src); + src += src_stride; + a1 = vld1_u8(src); + src += src_stride; + a2 = load_and_shift(src); + src += src_stride; + a3 = vld1_u8(src); + src += src_stride; + a4 = load_and_shift(src); + src += src_stride; + a5 = vld1_u8(src); + src += src_stride; + a6 = load_and_shift(src); + src += src_stride; + a7 = vld1_u8(src); + src += src_stride; + a8 = vld1_u8(src); + + // Combine the rows so we can operate on 8 at a time. + b0 = vext_u8(a0, a1, 4); + b2 = vext_u8(a2, a3, 4); + b4 = vext_u8(a4, a5, 4); + b6 = vext_u8(a6, a7, 4); + b8 = a8; + + // To keep with the 8-at-a-time theme, combine *alternate* rows. This + // allows combining the odd rows with the even. 
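  // At this point each 8-lane register holds two 4-pixel rows: b0 is the two
  // rows above the block, b2 rows 0-1, b4 rows 2-3, b6 rows 4-5, and b8 row 6.
  // The vext operations below build b1/b3/b5/b7, the same data offset by one
  // row, so each multiply-accumulate applies one tap to the inputs of two
  // output rows at once.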
+ b1 = vext_u8(b0, b2, 4); + b3 = vext_u8(b2, b4, 4); + b5 = vext_u8(b4, b6, 4); + b7 = vext_u8(b6, b8, 4); + + // Multiply and expand to 16 bits. + c0 = vmull_u8(b0, filter0); + c1 = vmull_u8(b2, filter0); + c2 = vmull_u8(b5, filter5); + c3 = vmull_u8(b7, filter5); + + // Multiply, subtract and accumulate for filters 1 and 4 (the negative + // ones). + c0 = vmlsl_u8(c0, b4, filter4); + c1 = vmlsl_u8(c1, b6, filter4); + c2 = vmlsl_u8(c2, b1, filter1); + c3 = vmlsl_u8(c3, b3, filter1); + + // Add more positive ones. vmlal should really return a signed type. + // It's doing signed math internally, as evidenced by the fact we can do + // subtractions followed by more additions. Ideally we could use + // vqmlal/sl but that instruction doesn't exist. Might be able to + // shoehorn vqdmlal/vqdmlsl in here but it would take some effort. + c0 = vmlal_u8(c0, b2, filter2); + c1 = vmlal_u8(c1, b4, filter2); + c2 = vmlal_u8(c2, b3, filter3); + c3 = vmlal_u8(c3, b5, filter3); + + // Use signed saturation math because vmlsl may have left some negative + // numbers in there. + d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0)); + d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1)); + + // Use signed again because numbers like -200 need to be saturated to 0. + e0 = vqrshrun_n_s16(d0, 7); + e1 = vqrshrun_n_s16(d1, 7); + + store_unaligned_u8q(dst, dst_stride, vcombine_u8(e0, e1)); +} + +void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, + unsigned char *dst_ptr, int dst_pitch) { + uint8x16_t s0, s1, s2, s3, s4; + uint64x2_t s01, s23; + // Variables to hold src[] elements for the given filter[] + uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5; + uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4; + uint8x16_t s01_f0, s23_f0; + uint64x2_t s01_f3, s23_f3; + uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q; + // Accumulator variables. + uint16x8_t d0123, d4567, d89; + uint16x8_t d0123_a, d4567_a, d89_a; + int16x8_t e0123, e4567, e89; + // Second pass intermediates. + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8; + uint16x8_t c0, c1, c2, c3; + int16x8_t d0, d1; + uint8x8_t e0, e1; + uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5; + + if (xoffset == 0) { // Second pass only. + yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch); + return; + } + + if (yoffset == 0) { // First pass only. + src_ptr -= 2; + } else { // Add context for the second pass. 2 extra lines on top. + src_ptr -= 2 + (src_pixels_per_line * 2); + } + + filter = vld1_u8(abs_filters[xoffset]); + filter0 = vdup_lane_u8(filter, 0); + filter1 = vdup_lane_u8(filter, 1); + filter2 = vdup_lane_u8(filter, 2); + filter3 = vdup_lane_u8(filter, 3); + filter4 = vdup_lane_u8(filter, 4); + filter5 = vdup_lane_u8(filter, 5); + + // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of + // garbage. So much effort for that last single bit. + // The low values of each pair are for filter0. + s0 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s1 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s2 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s3 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + + // Shift to extract values for filter[5] + // If src[] is 0, this puts: + // 3 4 5 6 7 8 9 10 in s0_f5 + // Can't use vshr.u64 because it crosses the double word boundary. 
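  // Only the low four lanes of each s*_f5 value are used: output pixel x takes
  // its filter[5] tap from source pixel x + 3, i.e. pixels 3..6 of the row.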
+ s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5); + s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5); + s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5); + s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5); + + s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1)); + s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3)); + + s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5)); + s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5)); + d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5); + d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5); + + // Keep original src data as 64 bits to simplify shifting and extracting. + s01 = vreinterpretq_u64_u8(s01_f0); + s23 = vreinterpretq_u64_u8(s23_f0); + + // 3 4 5 6 * filter0 + filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567); + + // Shift over one to use -1, 0, 1, 2 for filter1 + // -1 0 1 2 * filter1 + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1, + &d0123, &d4567); + + // 2 3 4 5 * filter4 + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4, + &d0123, &d4567); + + // 0 1 2 3 * filter2 + filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2, + &d0123, &d4567); + + // 1 2 3 4 * filter3 + s01_f3 = vshrq_n_u64(s01, 24); + s23_f3 = vshrq_n_u64(s23, 24); + s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)), + vreinterpret_u32_u64(vget_high_u64(s01_f3))); + s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)), + vreinterpret_u32_u64(vget_high_u64(s23_f3))); + // Accumulate into different registers so it can use saturated addition. + d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3); + d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3); + + e0123 = + vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a)); + e4567 = + vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a)); + + // Shift and narrow. + b0 = vqrshrun_n_s16(e0123, 7); + b2 = vqrshrun_n_s16(e4567, 7); + + if (yoffset == 0) { // firstpass_filter4x4_only + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2)); + return; + } + + // Load additional context when doing both filters. + s0 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s1 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s2 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s3 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + s4 = vld1q_u8(src_ptr); + + s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5); + s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5); + s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5); + s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5); + s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5); + + // 3 4 5 6 * filter0 + s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1)); + s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3)); + + s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5)); + s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5)); + // But this time instead of 16 pixels to filter, there are 20. So an extra + // run with a doubleword register. 
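  // These five extra rows, plus the four already filtered above, give the nine
  // horizontally filtered rows (two above, four in the block, three below)
  // that the vertical 6-tap pass needs; the odd fifth row is accumulated in
  // d89.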
+ d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5); + d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5); + d89 = vmull_u8(s4_f5, filter5); + + // Save a copy as u64 for shifting. + s01 = vreinterpretq_u64_u8(s01_f0); + s23 = vreinterpretq_u64_u8(s23_f0); + + filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567); + d89 = vmlal_u8(d89, vget_low_u8(s4), filter0); + + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1, + &d0123, &d4567); + s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1); + d89 = vmlsl_u8(d89, s4_f1, filter1); + + filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4, + &d0123, &d4567); + s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4); + d89 = vmlsl_u8(d89, s4_f4, filter4); + + filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)), + vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2, + &d0123, &d4567); + s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2); + d89 = vmlal_u8(d89, s4_f2, filter2); + + s01_f3 = vshrq_n_u64(s01, 24); + s23_f3 = vshrq_n_u64(s23, 24); + s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)), + vreinterpret_u32_u64(vget_high_u64(s01_f3))); + s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)), + vreinterpret_u32_u64(vget_high_u64(s23_f3))); + s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3); + d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3); + d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3); + d89_a = vmull_u8(s4_f3, filter3); + + e0123 = + vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a)); + e4567 = + vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a)); + e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a)); + + b4 = vqrshrun_n_s16(e0123, 7); + b6 = vqrshrun_n_s16(e4567, 7); + b8 = vqrshrun_n_s16(e89, 7); + + // Second pass: 4x4 + filter = vld1_u8(abs_filters[yoffset]); + filter0 = vdup_lane_u8(filter, 0); + filter1 = vdup_lane_u8(filter, 1); + filter2 = vdup_lane_u8(filter, 2); + filter3 = vdup_lane_u8(filter, 3); + filter4 = vdup_lane_u8(filter, 4); + filter5 = vdup_lane_u8(filter, 5); + + b1 = vext_u8(b0, b2, 4); + b3 = vext_u8(b2, b4, 4); + b5 = vext_u8(b4, b6, 4); + b7 = vext_u8(b6, b8, 4); + + c0 = vmull_u8(b0, filter0); + c1 = vmull_u8(b2, filter0); + c2 = vmull_u8(b5, filter5); + c3 = vmull_u8(b7, filter5); + + c0 = vmlsl_u8(c0, b4, filter4); + c1 = vmlsl_u8(c1, b6, filter4); + c2 = vmlsl_u8(c2, b1, filter1); + c3 = vmlsl_u8(c3, b3, filter1); + + c0 = vmlal_u8(c0, b2, filter2); + c1 = vmlal_u8(c1, b4, filter2); + c2 = vmlal_u8(c2, b3, filter3); + c3 = vmlal_u8(c3, b5, filter3); + + d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0)); + d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1)); + + e0 = vqrshrun_n_s16(d0, 7); + e1 = vqrshrun_n_s16(d1, 7); + + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1)); +} + void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch) { diff --git a/libs/libvpx/vp8/common/blockd.h b/libs/libvpx/vp8/common/blockd.h index 74fc5d6dbf..1a3aad16af 100644 --- a/libs/libvpx/vp8/common/blockd.h +++ b/libs/libvpx/vp8/common/blockd.h @@ -169,6 +169,11 @@ typedef struct { typedef struct { FRAME_TYPE frame_type; int is_frame_dropped; + // If frame is dropped due to overshoot 
after encode_frame. This triggers a + // drop and resets rate control with Q forced to max for following frame. + // The check for this dropping due to overshoot is only done on lowest stream, + // and if set will force drop on all spatial streams for that current frame. + int is_frame_dropped_overshoot_maxqp; // The frame rate for the lowest resolution. double low_res_framerate; /* The frame number of each reference frames */ diff --git a/libs/libvpx/vp8/common/entropymode.c b/libs/libvpx/vp8/common/entropymode.c index 30c2fa86ae..239492a8cb 100644 --- a/libs/libvpx/vp8/common/entropymode.c +++ b/libs/libvpx/vp8/common/entropymode.c @@ -34,12 +34,13 @@ int vp8_mv_cont(const int_mv *l, const int_mv *a) { static const vp8_prob sub_mv_ref_prob[VP8_SUBMVREFS - 1] = { 180, 162, 25 }; -const vp8_prob vp8_sub_mv_ref_prob2[SUBMVREF_COUNT] - [VP8_SUBMVREFS - 1] = { { 147, 136, 18 }, - { 106, 145, 1 }, - { 179, 121, 1 }, - { 223, 1, 34 }, - { 208, 1, 1 } }; +const vp8_prob vp8_sub_mv_ref_prob2[SUBMVREF_COUNT][VP8_SUBMVREFS - 1] = { + { 147, 136, 18 }, + { 106, 145, 1 }, + { 179, 121, 1 }, + { 223, 1, 34 }, + { 208, 1, 1 } +}; const vp8_mbsplit vp8_mbsplits[VP8_NUMMBSPLITS] = { { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }, diff --git a/libs/libvpx/vp8/common/entropymode.h b/libs/libvpx/vp8/common/entropymode.h index e0a17df10c..b3fad19be0 100644 --- a/libs/libvpx/vp8/common/entropymode.h +++ b/libs/libvpx/vp8/common/entropymode.h @@ -78,8 +78,8 @@ extern const vp8_prob vp8_kf_ymode_prob[VP8_YMODES - 1]; void vp8_init_mbmode_probs(VP8_COMMON *x); void vp8_default_bmode_probs(vp8_prob dest[VP8_BINTRAMODES - 1]); -void vp8_kf_default_bmode_probs(vp8_prob dest[VP8_BINTRAMODES][VP8_BINTRAMODES] - [VP8_BINTRAMODES - 1]); +void vp8_kf_default_bmode_probs( + vp8_prob dest[VP8_BINTRAMODES][VP8_BINTRAMODES][VP8_BINTRAMODES - 1]); #ifdef __cplusplus } // extern "C" diff --git a/libs/libvpx/vp8/common/generic/systemdependent.c b/libs/libvpx/vp8/common/generic/systemdependent.c index 89abd41c09..0432ee9887 100644 --- a/libs/libvpx/vp8/common/generic/systemdependent.c +++ b/libs/libvpx/vp8/common/generic/systemdependent.c @@ -14,6 +14,8 @@ #include "vpx_ports/arm.h" #elif ARCH_X86 || ARCH_X86_64 #include "vpx_ports/x86.h" +#elif ARCH_PPC +#include "vpx_ports/ppc.h" #endif #include "vp8/common/onyxc_int.h" #include "vp8/common/systemdependent.h" @@ -83,8 +85,6 @@ static int get_cpu_count() { } #endif -void vp8_clear_system_state_c(){}; - void vp8_machine_specific_config(VP8_COMMON *ctx) { #if CONFIG_MULTITHREAD ctx->processor_core_count = get_cpu_count(); @@ -96,5 +96,7 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) { ctx->cpu_caps = arm_cpu_caps(); #elif ARCH_X86 || ARCH_X86_64 ctx->cpu_caps = x86_simd_caps(); +#elif ARCH_PPC + ctx->cpu_caps = ppc_simd_caps(); #endif } diff --git a/libs/libvpx/vp8/common/loopfilter_filters.c b/libs/libvpx/vp8/common/loopfilter_filters.c index 1f60721e1c..188e290ca7 100644 --- a/libs/libvpx/vp8/common/loopfilter_filters.c +++ b/libs/libvpx/vp8/common/loopfilter_filters.c @@ -63,8 +63,8 @@ static void vp8_filter(signed char mask, uc hev, uc *op1, uc *op0, uc *oq0, filter_value &= mask; /* save bottom 3 bits so that we round one side +4 and the other +3 - * if it equals 4 we'll set to adjust by -1 to account for the fact - * we'd round 3 the other way + * if it equals 4 we'll set it to adjust by -1 to account for the fact + * we'd round it by 3 the other way */ Filter1 = vp8_signed_char_clamp(filter_value + 4); Filter2 = vp8_signed_char_clamp(filter_value + 3); @@ -86,10 
+86,12 @@ static void vp8_filter(signed char mask, uc hev, uc *op1, uc *op0, uc *oq0, u = vp8_signed_char_clamp(ps1 + filter_value); *op1 = u ^ 0x80; } -void vp8_loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */ - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, int count) { + +static void loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */ + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { int hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -109,10 +111,11 @@ void vp8_loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */ } while (++i < count * 8); } -void vp8_loop_filter_vertical_edge_c(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, int count) { +static void loop_filter_vertical_edge_c(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { int hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -185,11 +188,11 @@ static void vp8_mbfilter(signed char mask, uc hev, uc *op2, uc *op1, uc *op0, *op2 = s ^ 0x80; } -void vp8_mbloop_filter_horizontal_edge_c(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, - int count) { +static void mbloop_filter_horizontal_edge_c(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { signed char hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -210,10 +213,11 @@ void vp8_mbloop_filter_horizontal_edge_c(unsigned char *s, int p, } while (++i < count * 8); } -void vp8_mbloop_filter_vertical_edge_c(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, int count) { +static void mbloop_filter_vertical_edge_c(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { signed char hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -295,17 +299,17 @@ void vp8_loop_filter_simple_vertical_edge_c(unsigned char *s, int p, void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); + mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); if (u_ptr) { - vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } if (v_ptr) { - vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } } @@ -313,17 +317,17 @@ void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); + mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); if (u_ptr) { - vp8_mbloop_filter_vertical_edge_c(u_ptr, 
uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } if (v_ptr) { - vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } } @@ -331,21 +335,21 @@ void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); + loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); if (u_ptr) { - vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); + loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 1); } if (v_ptr) { - vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); + loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 1); } } @@ -363,21 +367,21 @@ void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); + loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); if (u_ptr) { - vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 1); + loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); } if (v_ptr) { - vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 1); + loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); } } diff --git a/libs/libvpx/vp8/common/mfqe.c b/libs/libvpx/vp8/common/mfqe.c index 5aace8c99d..b6f8146b84 100644 --- a/libs/libvpx/vp8/common/mfqe.c +++ b/libs/libvpx/vp8/common/mfqe.c @@ -74,8 +74,7 @@ static void apply_ifactor(unsigned char *y_src, int y_src_stride, src_weight); vp8_filter_by_weight8x8(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight); - } else /* if (block_size == 8) */ - { + } else { vp8_filter_by_weight8x8(y_src, y_src_stride, y_dst, y_dst_stride, src_weight); vp8_filter_by_weight4x4(u_src, uv_src_stride, u_dst, uv_dst_stride, @@ -136,8 +135,7 @@ static void multiframe_quality_enhance_block( usad = 
(vpx_sad8x8(u, uv_stride, ud, uvd_stride) + 32) >> 6; vsad = (vpx_sad8x8(v, uv_stride, vd, uvd_stride) + 32) >> 6; #endif - } else /* if (blksize == 8) */ - { + } else { actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse) + 32) >> 6; act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse) + 32) >> 6; #ifdef USE_SSD @@ -186,14 +184,12 @@ static void multiframe_quality_enhance_block( apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd, uvd_stride, blksize, ifactor); } - } else /* else implicitly copy from previous frame */ - { + } else { /* else implicitly copy from previous frame */ if (blksize == 16) { vp8_copy_mem16x16(y, y_stride, yd, yd_stride); vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride); vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride); - } else /* if (blksize == 8) */ - { + } else { vp8_copy_mem8x8(y, y_stride, yd, yd_stride); for (up = u, udp = ud, i = 0; i < uvblksize; ++i, up += uv_stride, udp += uvd_stride) { @@ -297,8 +293,7 @@ void vp8_multiframe_quality_enhance(VP8_COMMON *cm) { } } } - } else /* totmap = 4 */ - { + } else { /* totmap = 4 */ multiframe_quality_enhance_block( 16, qcurr, qprev, y_ptr, u_ptr, v_ptr, show->y_stride, show->uv_stride, yd_ptr, ud_ptr, vd_ptr, dest->y_stride, diff --git a/libs/libvpx/vp8/common/mips/dspr2/filter_dspr2.c b/libs/libvpx/vp8/common/mips/dspr2/filter_dspr2.c index 7612024b7d..e46827b0e4 100644 --- a/libs/libvpx/vp8/common/mips/dspr2/filter_dspr2.c +++ b/libs/libvpx/vp8/common/mips/dspr2/filter_dspr2.c @@ -673,9 +673,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr, : [tn1] "=&r"(tn1), [tp2] "=&r"(tp2), [n2] "=&r"(n2), [p4] "=&r"(p4), [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4) + [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p1] "+r"(p1) : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1), - [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), [p2] "r"(p2), + [n1] "r"(n1), [vector4a] "r"(vector4a), [p2] "r"(p2), [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3), [src_ptr] "r"(src_ptr)); @@ -724,9 +724,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr, : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [n1] "=&r"(n1), [p3] "=&r"(p3), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4) + [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p4] "+r"(p4) : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp2] "r"(tp2), - [p2] "r"(p2), [n2] "r"(n2), [p4] "r"(p4), [n4] "r"(n4), [p1] "r"(p1), + [p2] "r"(p2), [n2] "r"(n2), [n4] "r"(n4), [p1] "r"(p1), [src_ptr] "r"(src_ptr), [vector4a] "r"(vector4a), [vector3b] "r"(vector3b)); @@ -781,9 +781,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr, : [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2), [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [Temp4] "=r"(Temp4) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1), - [p4] "r"(p4), [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), + [Temp4] "=r"(Temp4), [tp1] "+r"(tp1) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [p4] "r"(p4), + [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3), [src_ptr] "r"(src_ptr), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); @@ -1469,6 +1469,7 @@ void vp8_filter_block2d_second_pass_8(unsigned char *RESTRICT src_ptr, unsigned char src_ptr_r2; unsigned char src_ptr_r3; unsigned char *cm = ff_cropTbl 
+ CROP_WIDTH; + (void)output_width; vector4a = 64; diff --git a/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c b/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c index b79af1cc88..d2c3442515 100644 --- a/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c +++ b/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c @@ -306,6 +306,7 @@ void vp8_loop_filter_horizontal_edge_mips(unsigned char *s, int p, uint32_t hev; uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + (void)count; mask = 0; hev = 0; @@ -498,6 +499,7 @@ void vp8_loop_filter_uvhorizontal_edge_mips(unsigned char *s, int p, uint32_t hev; uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + (void)count; mask = 0; hev = 0; @@ -918,6 +920,7 @@ void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p, uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *s1, *s2, *s3, *s4; uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + (void)count; /* loop filter designed to work using chars so that we can make maximum use * of 8 bit simd instructions. @@ -1612,6 +1615,7 @@ void vp8_mbloop_filter_uvhorizontal_edge_mips(unsigned char *s, int p, uint32_t mask, hev; uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + (void)count; mask = 0; hev = 0; @@ -1915,6 +1919,7 @@ void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p, uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *s1, *s2, *s3, *s4; uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + (void)count; mask = 0; hev = 0; diff --git a/libs/libvpx/vp8/common/mips/mmi/copymem_mmi.c b/libs/libvpx/vp8/common/mips/mmi/copymem_mmi.c new file mode 100644 index 0000000000..86a32aa9ef --- /dev/null +++ b/libs/libvpx/vp8/common/mips/mmi/copymem_mmi.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/asmdefs_mmi.h" + +#define COPY_MEM_16X2 \ + "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \ + "ldl %[tmp0], 0x0f(%[src]) \n\t" \ + "ldr %[tmp0], 0x08(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \ + "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \ + "sdl %[tmp0], 0x0f(%[dst]) \n\t" \ + "sdr %[tmp0], 0x08(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "ldl %[tmp1], 0x0f(%[src]) \n\t" \ + "ldr %[tmp1], 0x08(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + "gssdlc1 %[ftmp1], 0x07(%[dst]) \n\t" \ + "gssdrc1 %[ftmp1], 0x00(%[dst]) \n\t" \ + "sdl %[tmp1], 0x0f(%[dst]) \n\t" \ + "sdr %[tmp1], 0x08(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + +#define COPY_MEM_8X2 \ + "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + "ldl %[tmp0], 0x07(%[src]) \n\t" \ + "ldr %[tmp0], 0x00(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + \ + "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \ + "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) \ + "sdl %[tmp0], 0x07(%[dst]) \n\t" \ + "sdr %[tmp0], 0x00(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + +void vp8_copy_mem16x16_mmi(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride) { + double ftmp[2]; + uint64_t tmp[2]; + uint8_t loop_count = 4; + + /* clang-format off */ + __asm__ volatile ( + "1: \n\t" + COPY_MEM_16X2 + COPY_MEM_16X2 + MMI_ADDIU(%[loop_count], %[loop_count], -0x01) + "bnez %[loop_count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [loop_count]"+&r"(loop_count), + [dst]"+&r"(dst), [src]"+&r"(src) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} + +void vp8_copy_mem8x8_mmi(unsigned char *src, int src_stride, unsigned char *dst, + int dst_stride) { + double ftmp[2]; + uint64_t tmp[1]; + uint8_t loop_count = 4; + + /* clang-format off */ + __asm__ volatile ( + "1: \n\t" + COPY_MEM_8X2 + MMI_ADDIU(%[loop_count], %[loop_count], -0x01) + "bnez %[loop_count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), [loop_count]"+&r"(loop_count), + [dst]"+&r"(dst), [src]"+&r"(src) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} + +void vp8_copy_mem8x4_mmi(unsigned char *src, int src_stride, unsigned char *dst, + int dst_stride) { + double ftmp[2]; + uint64_t tmp[1]; + + /* clang-format off */ + __asm__ volatile ( + COPY_MEM_8X2 + COPY_MEM_8X2 + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), + [dst]"+&r"(dst), [src]"+&r"(src) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} diff --git a/libs/libvpx/vp8/common/mips/mmi/dequantize_mmi.c b/libs/libvpx/vp8/common/mips/mmi/dequantize_mmi.c new file mode 100644 index 0000000000..b3f8084aee --- /dev/null +++ b/libs/libvpx/vp8/common/mips/mmi/dequantize_mmi.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +void vp8_dequantize_b_mmi(BLOCKD *d, int16_t *DQC) { + double ftmp[8]; + + __asm__ volatile( + "gsldlc1 %[ftmp0], 0x07(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[qcoeff]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[qcoeff]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[qcoeff]) \n\t" + "gsldlc1 %[ftmp3], 0x1f(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp3], 0x18(%[qcoeff]) \n\t" + + "gsldlc1 %[ftmp4], 0x07(%[DQC]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[DQC]) \n\t" + "gsldlc1 %[ftmp5], 0x0f(%[DQC]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[DQC]) \n\t" + "gsldlc1 %[ftmp6], 0x17(%[DQC]) \n\t" + "gsldrc1 %[ftmp6], 0x10(%[DQC]) \n\t" + "gsldlc1 %[ftmp7], 0x1f(%[DQC]) \n\t" + "gsldrc1 %[ftmp7], 0x18(%[DQC]) \n\t" + + "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + + "gssdlc1 %[ftmp0], 0x07(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[dqcoeff]) \n\t" + "gssdlc1 %[ftmp1], 0x0f(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp1], 0x08(%[dqcoeff]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[dqcoeff]) \n\t" + "gssdlc1 %[ftmp3], 0x1f(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp3], 0x18(%[dqcoeff]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]) + : [dqcoeff] "r"(d->dqcoeff), [qcoeff] "r"(d->qcoeff), [DQC] "r"(DQC) + : "memory"); +} + +void vp8_dequant_idct_add_mmi(int16_t *input, int16_t *dq, unsigned char *dest, + int stride) { + double ftmp[8]; + + __asm__ volatile( + "gsldlc1 %[ftmp0], 0x07(%[dq]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[dq]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[dq]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[dq]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[dq]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[dq]) \n\t" + "gsldlc1 %[ftmp3], 0x1f(%[dq]) \n\t" + "gsldrc1 %[ftmp3], 0x18(%[dq]) \n\t" + + "gsldlc1 %[ftmp4], 0x07(%[input]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[input]) \n\t" + "gsldlc1 %[ftmp5], 0x0f(%[input]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[input]) \n\t" + "gsldlc1 %[ftmp6], 0x17(%[input]) \n\t" + "gsldrc1 %[ftmp6], 0x10(%[input]) \n\t" + "gsldlc1 %[ftmp7], 0x1f(%[input]) \n\t" + "gsldrc1 %[ftmp7], 0x18(%[input]) \n\t" + + "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + + "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t" + "gssdlc1 %[ftmp1], 0x0f(%[input]) \n\t" + "gssdrc1 %[ftmp1], 0x08(%[input]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[input]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[input]) \n\t" + "gssdlc1 %[ftmp3], 0x1f(%[input]) \n\t" + "gssdrc1 %[ftmp3], 0x18(%[input]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] 
"=&f"(ftmp[7]) + : [dq] "r"(dq), [input] "r"(input) + : "memory"); + + vp8_short_idct4x4llm_mmi(input, dest, stride, dest, stride); + + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t" + "sdl $0, 0x0f(%[input]) \n\t" + "sdr $0, 0x08(%[input]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[input]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[input]) \n\t" + "sdl $0, 0x1f(%[input]) \n\t" + "sdr $0, 0x18(%[input]) \n\t" + : [ftmp0] "=&f"(ftmp[0]) + : [input] "r"(input) + : "memory"); +} diff --git a/libs/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c b/libs/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c new file mode 100644 index 0000000000..f6020ab468 --- /dev/null +++ b/libs/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst, + int stride, int8_t *eobs) { + int i, j; + + for (i = 0; i < 4; i++) { + for (j = 0; j < 4; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dst, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst, stride, dst, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dst += 4; + } + + dst += 4 * stride - 16; + } +} + +void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dstu, + uint8_t *dstv, int stride, + int8_t *eobs) { + int i, j; + + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dstu, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstu, stride, dstu, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dstu += 4; + } + + dstu += 4 * stride - 8; + } + + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dstv, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstv, stride, dstv, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dstv += 4; + } + + dstv += 4 * stride - 8; + } +} diff --git a/libs/libvpx/vp8/common/mips/mmi/idctllm_mmi.c b/libs/libvpx/vp8/common/mips/mmi/idctllm_mmi.c new file mode 100644 index 0000000000..5e48f59166 --- /dev/null +++ b/libs/libvpx/vp8/common/mips/mmi/idctllm_mmi.c @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +#define TRANSPOSE_4H \ + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ + MMI_LI(%[tmp0], 0x93) \ + "mtc1 %[tmp0], %[ftmp10] \n\t" \ + "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \ + "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \ + "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \ + "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + +void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + double ftmp[12]; + uint32_t tmp[0]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_04) = { 0x0004000400040004ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = { 0x4e7b4e7b4e7b4e7bULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = { 0x22a322a322a322a3ULL }; + + __asm__ volatile ( + MMI_LI(%[tmp0], 0x02) + "mtc1 %[tmp0], %[ftmp11] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t" + "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t" + "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t" + + // ip[0...3] + ip[8...11] + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // ip[0...3] - ip[8...11] + "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + // (ip[12...15] * sinpi8sqrt2) >> 16 + "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t" + "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t" + // (ip[ 4... 7] * sinpi8sqrt2) >> 16 + "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t" + "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t" + // ip[ 4... 7] + ((ip[ 4... 
7] * cospi8sqrt2minus1) >> 16) + "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t" + // ip[12...15] + ((ip[12...15] * cospi8sqrt2minus1) >> 16) + "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" + "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t" + "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + + TRANSPOSE_4H + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // b + "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + // c + "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t" + "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t" + "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t" + "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t" + // d + "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t" + "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t" + "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t" + "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" + + MMI_LI(%[tmp0], 0x03) + "mtc1 %[tmp0], %[ftmp11] \n\t" + // a + d + "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_ph_04] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + // b + c + "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_ph_04] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + // b - c + "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_ph_04] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + // a - d + "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_ph_04] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + TRANSPOSE_4H +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp5] \n\t" +#else + "gslwlc1 %[ftmp5], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp5], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp6] \n\t" +#else + "gslwlc1 %[ftmp6], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp2], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp2], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" +#else + "gslwlc1 %[ftmp7], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "gsswlc1 %[ftmp3], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp3], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 
0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" +#else + "gslwlc1 %[ftmp8], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp8], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "gsswlc1 %[ftmp4], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp4], 0x00(%[dst_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), + [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), + [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), + [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr) + : [ip]"r"(input), [ff_ph_22a3]"f"(ff_ph_22a3), + [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_04]"f"(ff_ph_04), + [pred_stride]"r"((mips_reg)pred_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); +} + +void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + int a1 = ((input_dc + 4) >> 3); + double ftmp[5]; + int low32; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pshufh %[a1], %[a1], %[ftmp0] \n\t" + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32), + [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr) + : [dst_stride]"r"((mips_reg)dst_stride), + [pred_stride]"r"((mips_reg)pred_stride), [a1]"f"(a1) + : "memory" + ); +} + +void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) { + int i; + int16_t output[16]; + double ftmp[12]; + uint32_t tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_03) = { 0x0003000300030003ULL }; + + __asm__ volatile ( + MMI_LI(%[tmp0], 0x03) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t" 
+ "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t" + "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t" + "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t" + "paddh %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp7], %[ftmp3], %[ftmp4] \n\t" + "psubh %[ftmp8], %[ftmp3], %[ftmp4] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp2], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + + TRANSPOSE_4H + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + // d + "psubh %[ftmp6], %[ftmp1], %[ftmp4] \n\t" + // b + "paddh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + // c + "psubh %[ftmp8], %[ftmp2], %[ftmp3] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" + "psubh %[ftmp3], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + + "paddh %[ftmp1], %[ftmp1], %[ff_ph_03] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_ph_03] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_ph_03] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_ph_03] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + TRANSPOSE_4H + "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t" + "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t" + "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t" + "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t" + "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t" + "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), + [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), + [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]) + : [ip]"r"(input), [op]"r"(output), [ff_ph_03]"f"(ff_ph_03) + : "memory" + ); + + for (i = 0; i < 16; i++) { + mb_dqcoeff[i * 16] = output[i]; + } +} diff --git a/libs/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c b/libs/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c new file mode 100644 index 0000000000..f2182f95c9 --- /dev/null +++ b/libs/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c @@ -0,0 +1,1337 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vp8/common/loopfilter.h" +#include "vp8/common/onyxc_int.h" +#include "vpx_ports/asmdefs_mmi.h" + +DECLARE_ALIGNED(8, static const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; +DECLARE_ALIGNED(8, static const uint64_t, + ff_ph_003f) = { 0x003f003f003f003fULL }; +DECLARE_ALIGNED(8, static const uint64_t, + ff_ph_0900) = { 0x0900090009000900ULL }; +DECLARE_ALIGNED(8, static const uint64_t, + ff_ph_1200) = { 0x1200120012001200ULL }; +DECLARE_ALIGNED(8, static const uint64_t, + ff_ph_1b00) = { 0x1b001b001b001b00ULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_fe) = { 0xfefefefefefefefeULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_80) = { 0x8080808080808080ULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_04) = { 0x0404040404040404ULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_03) = { 0x0303030303030303ULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_01) = { 0x0101010101010101ULL }; + +void vp8_loop_filter_horizontal_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + uint32_t tmp[1]; + mips_reg addr[2]; + double ftmp[12]; + __asm__ volatile ( + "1: \n\t" + "gsldlc1 %[ftmp10], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[limit]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gsldlc1 %[ftmp1], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr1]) \n\t" + + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t" + "psubusb %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp4], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp9], %[ftmp4], %[ftmp5] \n\t" + "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t" + "psubusb %[ftmp1], %[ftmp11], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp8], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "and %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "gsldlc1 %[ftmp10], 
0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[blimit]) \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + + "gsldlc1 %[ftmp10], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[thresh]) \n\t" + "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "psubusb %[ftmp2], %[ftmp11], %[ftmp10] \n\t" + "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + + "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + + "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "psubsb %[ftmp3], %[ftmp6], %[ftmp5] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + + "paddsb %[ftmp8], %[ftmp2], %[ff_pb_03] \n\t" + "paddsb %[ftmp9], %[ftmp2], %[ff_pb_04] \n\t" + + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp8] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t" + "packsshb %[ftmp8], %[ftmp0], %[ftmp11] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + "xor %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + "punpckhbh %[ftmp9], %[ftmp11], %[ftmp9] \n\t" + "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t" + "paddsh %[ftmp11], %[ftmp0], %[ff_ph_01] \n\t" + "packsshb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "paddsh %[ftmp9], %[ftmp9], %[ff_ph_01] \n\t" + + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t" + "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t" + "packsshb %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + "pandn %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp5], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t" + + "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + + "addiu %[count], %[count], -0x01 \n\t" + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + 
[addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), + [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)), + [ff_ph_01]"f"(ff_ph_01), [ff_pb_fe]"f"(ff_pb_fe), + [ff_pb_80]"f"(ff_pb_80), [ff_pb_04]"f"(ff_pb_04), + [ff_pb_03]"f"(ff_pb_03) + : "memory" + ); +} + +void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count) { + uint32_t tmp[1]; + mips_reg addr[2]; + double ftmp[13]; + + __asm__ volatile ( + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) + + "1: \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SLL (%[tmp0], %[src_pixel_step], 0x01) + MMI_ADDU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_ADDU(%[addr1], %[addr0], %[tmp0]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp1], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t" + + "gsldlc1 %[ftmp11], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t" + "punpcklbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp4], %[ftmp11], %[ftmp12] \n\t" + + "punpcklhw %[ftmp5], %[ftmp4], %[ftmp2] \n\t" + "punpckhhw %[ftmp6], %[ftmp4], %[ftmp2] \n\t" + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t" + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp1] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x01) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp9], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp10], %[ftmp11], %[ftmp12] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[addr0], %[tmp0]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp0], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + + "punpcklhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t" + "punpckhhw %[ftmp2], %[ftmp11], %[ftmp10] \n\t" + "punpcklhw %[ftmp3], %[ftmp0], %[ftmp9] \n\t" + "punpckhhw %[ftmp4], %[ftmp0], %[ftmp9] \n\t" + + /* ftmp9:q0 ftmp10:q1 */ + "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t" + "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t" + /* ftmp11:q2 ftmp12:q3 */ + "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t" + "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t" + /* ftmp1:p3 ftmp2:p2 */ + "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t" + "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t" + /* ftmp5:p1 ftmp6:p0 */ + "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[limit]) \n\t" + + /* abs (q3-q2) */ + "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + "psubusb %[ftmp0], %[ftmp7], 
%[ftmp8] \n\t" + /* abs (q2-q1) */ + "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp3: abs(q1-q0) */ + "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp4: abs(p1-p0) */ + "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp4], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p2-p1) */ + "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p3-p2) */ + "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[blimit]) \n\t" + + /* abs (p0-q0) */ + "pasubub %[ftmp11], %[ftmp9], %[ftmp6] \n\t" + "paddusb %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + /* abs (p1-q1) */ + "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" + "and %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp1] \n\t" + "psrlh %[ftmp12], %[ftmp12], %[ftmp1] \n\t" + "paddusb %[ftmp1], %[ftmp11], %[ftmp12] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp0:mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[thresh]) \n\t" + + /* ftmp3: abs(q1-q0) ftmp4: abs(p1-p0) */ + "psubusb %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "psubusb %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "or %[ftmp2], %[ftmp4], %[ftmp3] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp1:hev */ + "xor %[ftmp1], %[ftmp2], %[ftmp1] \n\t" + + "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + "psubsb %[ftmp2], %[ftmp5], %[ftmp10] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "psubsb %[ftmp3], %[ftmp9], %[ftmp6] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + /* ftmp2:filter_value */ + "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + + "paddsb %[ftmp11], %[ftmp2], %[ff_pb_04] \n\t" + "paddsb %[ftmp12], %[ftmp2], %[ff_pb_03] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp12], %[ftmp0], %[ftmp8] \n\t" + + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp11] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp11], %[ftmp0], %[ftmp8] \n\t" + + "psubsb %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "paddsb %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "paddsh %[ftmp0], %[ftmp0], %[ff_ph_01] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ff_ph_01] \n\t" + + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "psrah 
%[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp2], %[ftmp0], %[ftmp8] \n\t" + "pandn %[ftmp2], %[ftmp1], %[ftmp2] \n\t" + "psubsb %[ftmp10], %[ftmp10], %[ftmp2] \n\t" + "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + /* ftmp5: *op1 ; ftmp6: *op0 */ + "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + /* ftmp9: *oq0 ; ftmp10: *oq1 */ + "punpcklbh %[ftmp4], %[ftmp9], %[ftmp10] \n\t" + "punpckhbh %[ftmp3], %[ftmp9], %[ftmp10] \n\t" + "punpckhhw %[ftmp6], %[ftmp2], %[ftmp4] \n\t" + "punpcklhw %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t" + + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "dsrl %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[addr0], %[tmp0]) + "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x01) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t" + + "dsrl %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t" + "gsswlc1 %[ftmp1], 0x05(%[src_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x02(%[src_ptr]) \n\t" + + "dsrl %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "gsswlc1 %[ftmp1], 0x05(%[addr0]) \n\t" + "gsswrc1 %[ftmp1], 0x02(%[addr0]) \n\t" + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step]) + "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t" + + "dsrl %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + MMI_ADDU(%[addr1], %[addr0], %[tmp0]) + "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t" + + MMI_ADDIU(%[count], %[count], -0x01) + MMI_SLL(%[tmp0], %[src_pixel_step], 0x03) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [ff_ph_01]"f"(ff_ph_01), [ff_pb_03]"f"(ff_pb_03), + [ff_pb_04]"f"(ff_pb_04), [ff_pb_80]"f"(ff_pb_80), + [ff_pb_fe]"f"(ff_pb_fe) + : "memory" + ); +} + +/* clang-format off */ +#define VP8_MBLOOP_HPSRAB \ + "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" \ + "psrah %[ftmp10], %[ftmp10], %[ftmp9] \n\t" \ + "psrah %[ftmp11], %[ftmp11], %[ftmp9] \n\t" \ + "packsshb %[ftmp0], %[ftmp10], %[ftmp11] \n\t" + +#define VP8_MBLOOP_HPSRAB_ADD(reg) \ + "punpcklbh %[ftmp1], %[ftmp0], %[ftmp12] \n\t" \ + "punpckhbh %[ftmp2], %[ftmp0], %[ftmp12] \n\t" \ + "pmulhh %[ftmp1], %[ftmp1], " #reg " \n\t" \ + "pmulhh %[ftmp2], %[ftmp2], " #reg " \n\t" \ + "paddh %[ftmp1], 
%[ftmp1], %[ff_ph_003f] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_003f] \n\t" \ + "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \ + "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" \ + "packsshb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" +/* clang-format on */ + +void vp8_mbloop_filter_horizontal_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + uint32_t tmp[1]; + double ftmp[13]; + + __asm__ volatile ( + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + "1: \n\t" + "gsldlc1 %[ftmp9], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[limit]) \n\t" + /* ftmp1: p3 */ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + /* ftmp3: p2 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + /* ftmp4: p1 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + /* ftmp5: p0 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + /* ftmp6: q0 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + /* ftmp7: q1 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + /* ftmp8: q2 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + /* ftmp2: q3 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp2], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[src_ptr]) \n\t" + + "gsldlc1 %[ftmp12], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[blimit]) \n\t" + + "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t" + "psubusb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp10], %[ftmp4], %[ftmp5] \n\t" + "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t" + "psubusb %[ftmp1], %[ftmp11], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "and %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp12] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + /* ftmp0: mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + + "gsldlc1 %[ftmp9], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[thresh]) \n\t" + "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp2], %[ftmp11], %[ftmp9] \n\t" + "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp2] 
\n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + /* ftmp1: hev */ + "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + + "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "psubsb %[ftmp9], %[ftmp6], %[ftmp5] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "paddsb %[ftmp0], %[ftmp2], %[ff_pb_03] \n\t" + VP8_MBLOOP_HPSRAB + "paddsb %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddsb %[ftmp0], %[ftmp2], %[ff_pb_04] \n\t" + VP8_MBLOOP_HPSRAB + "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + + "li %[tmp0], 0x07 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00]) + "psubsb %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + "gssdlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200]) + "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900]) + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "xor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "psubsb %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "xor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "gssdlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "addiu %[count], %[count], -0x01 \n\t" + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), + [ff_pb_04]"f"(ff_pb_04), 
[ff_pb_03]"f"(ff_pb_03), + [ff_ph_0900]"f"(ff_ph_0900), [ff_ph_1b00]"f"(ff_ph_1b00), + [ff_ph_1200]"f"(ff_ph_1200), [ff_ph_003f]"f"(ff_ph_003f) + : "memory" + ); +} + +#define VP8_MBLOOP_VPSRAB_ADDH \ + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \ + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \ + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + +#define VP8_MBLOOP_VPSRAB_ADDT \ + "paddh %[ftmp7], %[ftmp7], %[ff_ph_003f] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_003f] \n\t" \ + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" \ + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" \ + "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t" + +void vp8_mbloop_filter_vertical_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + mips_reg tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, srct[1]); + double ftmp[14]; + + __asm__ volatile ( + MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) + + "1: \n\t" + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t" + + "punpcklhw %[ftmp1], %[ftmp12], %[ftmp10] \n\t" + "punpckhhw %[ftmp2], %[ftmp12], %[ftmp10] \n\t" + "punpcklhw %[ftmp3], %[ftmp11], %[ftmp9] \n\t" + "punpckhhw %[ftmp4], %[ftmp11], %[ftmp9] \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t" + + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp10] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp10] \n\t" + "punpcklhw %[ftmp7], %[ftmp11], %[ftmp9] \n\t" + "punpckhhw %[ftmp8], %[ftmp11], %[ftmp9] \n\t" + + "gsldlc1 %[ftmp13], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp13], 0x00(%[limit]) \n\t" + /* ftmp9:q0 ftmp10:q1 */ + "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t" + "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t" + /* ftmp11:q2 ftmp12:q3 */ + "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t" + "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t" + /* srct[0x00]: q3 */ + "sdc1 %[ftmp12], 0x00(%[srct]) \n\t" + /* ftmp1:p3 ftmp2:p2 */ + "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t" + "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t" + /* srct[0x08]: p3 */ + "sdc1 %[ftmp1], 0x08(%[srct]) \n\t" + /* ftmp5:p1 ftmp6:p0 */ + "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] 
\n\t" + "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t" + + /* abs (q3-q2) */ + "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + "psubusb %[ftmp0], %[ftmp7], %[ftmp13] \n\t" + /* abs (q2-q1) */ + "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp3: abs(q1-q0) */ + "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp4: abs(p1-p0) */ + "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp4], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p2-p1) */ + "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p3-p2) */ + "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + + "gsldlc1 %[ftmp13], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp13], 0x00(%[blimit]) \n\t" + "gsldlc1 %[ftmp7], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[thresh]) \n\t" + /* abs (p0-q0) * 2 */ + "pasubub %[ftmp1], %[ftmp9], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* abs (p1-q1) / 2 */ + "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" + "and %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + "psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" + "paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t" + "psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + /* ftmp0: mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + + /* abs(p1-p0) - thresh */ + "psubusb %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + /* abs(q1-q0) - thresh */ + "psubusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "or %[ftmp3], %[ftmp4], %[ftmp3] \n\t" + "pcmpeqb %[ftmp3], %[ftmp3], %[ftmp12] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp1: hev */ + "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + + /* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */ + "xor %[ftmp11], %[ftmp11], %[ff_pb_80] \n\t" + "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "xor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + + "psubsb %[ftmp3], %[ftmp5], %[ftmp10] \n\t" + "psubsb %[ftmp4], %[ftmp9], %[ftmp6] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + /* filter_value &= mask */ + "and %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + /* Filter2 = filter_value & hev */ + "and %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + /* filter_value &= ~hev */ + "pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t" + + "paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t" + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp12] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "packsshb %[ftmp4], %[ftmp7], %[ftmp8] \n\t" + /* ftmp9: qs0 */ + "psubsb %[ftmp9], %[ftmp9], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ff_pb_03] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp3] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "packsshb %[ftmp3], %[ftmp7], %[ftmp8] 
\n\t" + /* ftmp6: ps0 */ + "paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + + "li %[tmp0], 0x07 \n\t" + "mtc1 %[tmp0], %[ftmp12] \n\t" + VP8_MBLOOP_VPSRAB_ADDH + "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp9], %[ftmp3] \n\t" + /* ftmp9: oq0 */ + "xor %[ftmp9], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp6], %[ftmp3] \n\t" + /* ftmp6: op0 */ + "xor %[ftmp6], %[ftmp4], %[ff_pb_80] \n\t" + + VP8_MBLOOP_VPSRAB_ADDH + "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp10], %[ftmp3] \n\t" + /* ftmp10: oq1 */ + "xor %[ftmp10], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp5], %[ftmp3] \n\t" + /* ftmp5: op1 */ + "xor %[ftmp5], %[ftmp4], %[ff_pb_80] \n\t" + + VP8_MBLOOP_VPSRAB_ADDH + "pmulhh %[ftmp7], %[ftmp7], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ff_ph_0900] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp11], %[ftmp3] \n\t" + /* ftmp11: oq2 */ + "xor %[ftmp11], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp2], %[ftmp3] \n\t" + /* ftmp2: op2 */ + "xor %[ftmp2], %[ftmp4], %[ff_pb_80] \n\t" + + "ldc1 %[ftmp12], 0x00(%[srct]) \n\t" + "ldc1 %[ftmp8], 0x08(%[srct]) \n\t" + + "punpcklbh %[ftmp0], %[ftmp8], %[ftmp2] \n\t" + "punpckhbh %[ftmp1], %[ftmp8], %[ftmp2] \n\t" + "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp3], %[ftmp5], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp5], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + "punpckhhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" + + "punpcklbh %[ftmp0], %[ftmp9], %[ftmp10] \n\t" + "punpckhbh %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "punpcklbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t" + "punpcklhw %[ftmp8], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp9], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp10], %[ftmp1], %[ftmp3] \n\t" + "punpckhhw %[ftmp11], %[ftmp1], %[ftmp3] \n\t" + + "punpcklwd %[ftmp0], %[ftmp7], %[ftmp11] \n\t" + "punpckhwd %[ftmp1], %[ftmp7], %[ftmp11] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp0], %[ftmp6], %[ftmp10] \n\t" + "punpckhwd %[ftmp1], %[ftmp6], %[ftmp10] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp9] \n\t" + "punpckhwd %[ftmp0], %[ftmp5], %[ftmp9] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp1], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp0], %[ftmp4], %[ftmp8] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 
0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "addiu %[count], %[count], -0x01 \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x03) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr), + [count]"+&r"(count) + : [limit]"r"(limit), [blimit]"r"(blimit), + [srct]"r"(srct), [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [ff_ph_003f]"f"(ff_ph_003f), [ff_ph_0900]"f"(ff_ph_0900), + [ff_pb_03]"f"(ff_pb_03), [ff_pb_04]"f"(ff_pb_04), + [ff_pb_80]"f"(ff_pb_80), [ff_pb_fe]"f"(ff_pb_fe) + : "memory" + ); +} + +#define VP8_SIMPLE_HPSRAB \ + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" \ + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" \ + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" \ + "psrah %[ftmp1], %[ftmp5], %[ftmp10] \n\t" \ + "psllh %[ftmp1], %[ftmp1], %[ftmp8] \n\t" \ + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + +void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit) { + uint32_t tmp[1], count = 2; + mips_reg addr[2]; + double ftmp[12]; + + __asm__ volatile ( + "li %[tmp0], 0x08 \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + "li %[tmp0], 0x03 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp3], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[blimit]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + "pasubub %[ftmp1], %[ftmp7], %[ftmp2] \n\t" + "and %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + "pasubub %[ftmp5], %[ftmp6], %[ftmp0] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + + "xor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "psubsb %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp3], %[ftmp0], %[ff_pb_80] \n\t" + "psubsb %[ftmp0], %[ftmp3], %[ftmp6] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "and %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + + "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" + VP8_SIMPLE_HPSRAB + "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + + "psubsb %[ftmp5], 
%[ftmp5], %[ff_pb_01] \n\t" + VP8_SIMPLE_HPSRAB + "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp6], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + "addiu %[count], %[count], -0x01 \n\t" + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [blimit]"r"(blimit), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), + [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), + [ff_pb_04]"f"(ff_pb_04), [ff_pb_01]"f"(ff_pb_01) + : "memory" + ); +} + +void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit) { + uint32_t tmp[1], count = 2; + mips_reg addr[2]; + DECLARE_ALIGNED(8, const uint64_t, srct[1]); + double ftmp[12]; + + __asm__ volatile ( + "li %[tmp0], 0x08 \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4]) + MMI_SUBU(%[src_ptr], %[src_ptr], 0x02) + + "1: \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "gslwlc1 %[ftmp4], 0x03(%[src_ptr]) \n\t" + "gslwrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpckhhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gslwlc1 %[ftmp7], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp7] \n\t" + + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gslwlc1 %[ftmp1], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp1], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "punpckhhw %[ftmp2], %[ftmp0], %[ftmp6] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t" + "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "punpckhwd %[ftmp3], %[ftmp2], %[ftmp5] \n\t" + "punpcklwd %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "pasubub %[ftmp6], %[ftmp3], %[ftmp0] \n\t" + "and %[ftmp6], %[ftmp6], %[ff_pb_fe] \n\t" + "psrlh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "pasubub %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddusb %[ftmp5], 
%[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp7], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[blimit]) \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + + "sdc1 %[ftmp0], 0x00(%[srct]) \n\t" + "sdc1 %[ftmp3], 0x08(%[srct]) \n\t" + + "xor %[ftmp0], %[ftmp0], %[ff_pb_80] \n\t" + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "psubsb %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + + "xor %[ftmp6], %[ftmp1], %[ff_pb_80] \n\t" + "xor %[ftmp3], %[ftmp2], %[ff_pb_80] \n\t" + "psubsb %[ftmp7], %[ftmp3], %[ftmp6] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "and %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" + + "li %[tmp0], 0x03 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp7], %[ftmp5], %[ftmp9] \n\t" + "psllh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t" + + "li %[tmp0], 0x03 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "psllh %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + + "ldc1 %[ftmp0], 0x00(%[srct]) \n\t" + "ldc1 %[ftmp4], 0x08(%[srct]) \n\t" + + "punpckhbh %[ftmp1], %[ftmp0], %[ftmp6] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "punpcklbh %[ftmp2], %[ftmp3], %[ftmp4] \n\t" + "punpckhbh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + + "punpckhhw %[ftmp6], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + + "dsrl %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + "dsrl %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[src_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + + "dsrl %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[addr0]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + + "dsrl %[ftmp5], %[ftmp5], %[ftmp10] \n\t" + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x8]) + "addiu %[count], 
%[count], -0x01 \n\t" + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [blimit]"r"(blimit), [srct]"r"(srct), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), + [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)), + [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3)), + [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), + [ff_pb_04]"f"(ff_pb_04), [ff_pb_01]"f"(ff_pb_01) + : "memory" + ); +} + +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_mbloop_filter_horizontal_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_mmi(u_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_mmi(v_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_mbloop_filter_vertical_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_mmi(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_mmi(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_mmi(u_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_mmi(v_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_loop_filter_vertical_edge_mmi(y_ptr + 4, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmi(y_ptr + 8, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmi(y_ptr + 12, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_mmi(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_mmi(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); +} + +void vp8_loop_filter_bhs_mmi(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, + blimit); + 
vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, + blimit); + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride, + blimit); +} + +void vp8_loop_filter_bvs_mmi(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 12, y_stride, blimit); +} diff --git a/libs/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c b/libs/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c new file mode 100644 index 0000000000..77d665d456 --- /dev/null +++ b/libs/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/filter.h" +#include "vpx_ports/asmdefs_mmi.h" + +DECLARE_ALIGNED(8, static const int16_t, vp8_six_tap_mmi[8][6 * 8]) = { + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, + 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, + 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, + 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, + 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, + 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, + 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, + 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, + 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, + 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0x005d, 0x005d, 0x005d, 0x005d, 
0x005d, 0x005d, 0x005d, 0x005d, + 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, + 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, + 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, + 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, + 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 } +}; + +/* Horizontal filter: pixel_step is 1, output_height and output_width are + the size of horizontal filtering output, output_height is always H + 5 */ +static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp8_filter) { + uint32_t tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); + register double ftmp2 asm("$f6"); + register double ftmp3 asm("$f8"); + register double ftmp4 asm("$f10"); + register double ftmp5 asm("$f12"); + register double ftmp6 asm("$f14"); + register double ftmp7 asm("$f16"); + register double ftmp8 asm("$f18"); + register double ftmp9 asm("$f20"); + register double ftmp10 asm("$f22"); + register double ftmp11 asm("$f24"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f3"); + register double ftmp3 asm("$f4"); + register double ftmp4 asm("$f5"); + register double ftmp5 asm("$f6"); + register double ftmp6 asm("$f7"); + register double ftmp7 asm("$f8"); + register double ftmp8 asm("$f9"); + register double ftmp9 asm("$f10"); + register double ftmp10 asm("$f11"); + register double ftmp11 asm("$f12"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" + "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" + "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" + "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" + "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" + "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" + "xor %[fzero], %[fzero], %[fzero] \n\t" + "li %[tmp0], 0x07 \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "li %[tmp0], 0x08 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp9], -0x02(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp10], 0x06(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp10], -0x01(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" + + "punpckhbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "punpckhbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], 
%[ftmp5] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "paddsh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packushb %[ftmp8], %[ftmp8], %[fzero] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[fzero] \n\t" + "gssdlc1 %[ftmp8], 0x07(%[output_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x00(%[output_ptr]) \n\t" + + "addiu %[output_height], %[output_height], -0x01 \n\t" + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), + [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), + [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), + [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), + [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), + [ftmp11]"=&f"(ftmp11), [tmp0]"=&r"(tmp[0]), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height), + [src_ptr]"+&r"(src_ptr) + : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), + [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width), + [ff_ph_40]"f"(ff_ph_40) + : "memory" + ); +} + +/* Horizontal filter: pixel_step is always W */ +static INLINE void vp8_filter_block1dc_v6_mmi( + uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, + int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) { + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + uint32_t tmp[1]; + mips_reg addr[1]; +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); + register double ftmp2 asm("$f6"); + register double ftmp3 asm("$f8"); + register double ftmp4 asm("$f10"); + register double ftmp5 asm("$f12"); + register double ftmp6 asm("$f14"); + register double ftmp7 asm("$f16"); + register double ftmp8 asm("$f18"); + register double ftmp9 asm("$f20"); + register double ftmp10 asm("$f22"); + register double ftmp11 asm("$f24"); + register double ftmp12 asm("$f26"); + register double ftmp13 asm("$f28"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f3"); + register double ftmp3 asm("$f4"); + register double ftmp4 asm("$f5"); + register double ftmp5 asm("$f6"); + register double ftmp6 asm("$f7"); + register double ftmp7 asm("$f8"); + register double ftmp8 asm("$f9"); + register double ftmp9 asm("$f10"); + register double ftmp10 asm("$f11"); + register double ftmp11 asm("$f12"); + register double ftmp12 asm("$f13"); + register double ftmp13 asm("$f14"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" + "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" + "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" + "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" + "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" + "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" + "xor %[fzero], %[fzero], %[fzero] \n\t" + "li %[tmp0], 0x07 \n\t" + "mtc1 %[tmp0], %[ftmp13] \n\t" + + /* In order to make full use of memory load delay slot, + * Operation of memory loading and calculating has been 
rearranged. + */ + "1: \n\t" + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line]) + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) + "gsldlc1 %[ftmp8], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr0]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) + "gsldlc1 %[ftmp9], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) + "gsldlc1 %[ftmp10], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) + "gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t" + + "pmullh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + + "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp7] \n\t" + + "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" + + "pmullh %[ftmp9], %[ftmp9], %[ftmp4] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp9] \n\t" + + "pmullh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp10] \n\t" + + "pmullh %[ftmp11], %[ftmp11], %[ftmp5] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp11] \n\t" + + "paddsh %[ftmp12], %[ftmp12], %[ff_ph_40] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp13] \n\t" + "packushb %[ftmp12], %[ftmp12], %[fzero] \n\t" + "gsswlc1 %[ftmp12], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp12], 0x00(%[output_ptr]) \n\t" + + MMI_ADDIU(%[output_height], %[output_height], -0x01) + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), + [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), + [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), + [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), + [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), + [ftmp11]"=&f"(ftmp11), [ftmp12]"=&f"(ftmp12), + [ftmp13]"=&f"(ftmp13), [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [pixels_per_line]"r"((mips_reg)pixels_per_line), + [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)), + [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)), + [vp8_filter]"r"(vp8_filter), + [output_pitch]"r"((mips_reg)output_pitch), + [ff_ph_40]"f"(ff_ph_40) + : "memory" + ); +} + +/* When xoffset == 0, vp8_filter= {0,0,128,0,0,0}, + function vp8_filter_block1d_h6_mmi and vp8_filter_block1d_v6_mmi can + be simplified */ +static INLINE void vp8_filter_block1d_h6_filter0_mmi( + unsigned char *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int output_height, + unsigned int output_width) { +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "xor %[fzero], %[fzero], %[fzero] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) + + "punpcklbh %[ftmp1], %[ftmp0], %[fzero] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[output_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[output_ptr]) 
\n\t" + + "addiu %[output_height], %[output_height], -0x01 \n\t" + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), + [output_width]"r"(output_width) + : "memory" + ); +} + +static INLINE void vp8_filter_block1dc_v6_filter0_mmi( + uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, + int output_pitch, unsigned int pixels_per_line) { +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "xor %[fzero], %[fzero], %[fzero] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDIU(%[output_height], %[output_height], -0x01) + "packushb %[ftmp1], %[ftmp0], %[fzero] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t" + + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [pixels_per_line]"r"((mips_reg)pixels_per_line), + [output_pitch]"r"((mips_reg)output_pitch) + : "memory" + ); +} + +#define sixtapNxM(n, m) \ + void vp8_sixtap_predict##n##x##m##_mmi( \ + unsigned char *src_ptr, int src_pixels_per_line, int xoffset, \ + int yoffset, unsigned char *dst_ptr, int dst_pitch) { \ + DECLARE_ALIGNED(16, uint16_t, \ + FData2[(n + 5) * (n == 16 ? 24 : (n == 8 ? 
16 : n))]); \ + const int16_t *HFilter, *VFilter; \ + int i, loop = n / 4; \ + HFilter = vp8_six_tap_mmi[xoffset]; \ + VFilter = vp8_six_tap_mmi[yoffset]; \ + \ + if (xoffset == 0) { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1d_h6_filter0_mmi( \ + src_ptr - (2 * src_pixels_per_line) + i * 4, FData2 + i * 4, \ + src_pixels_per_line, m + 5, n * 2); \ + } \ + } else { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1d_h6_mmi(src_ptr - (2 * src_pixels_per_line) + i * 4, \ + FData2 + i * 4, src_pixels_per_line, m + 5, \ + n * 2, HFilter); \ + } \ + } \ + if (yoffset == 0) { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1dc_v6_filter0_mmi( \ + FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, dst_pitch, n * 2); \ + } \ + } else { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1dc_v6_mmi(FData2 + i * 4, dst_ptr + i * 4, m, \ + dst_pitch, n * 2, VFilter); \ + } \ + } \ + } + +sixtapNxM(4, 4); +sixtapNxM(8, 8); +sixtapNxM(8, 4); +sixtapNxM(16, 16); diff --git a/libs/libvpx/vp8/common/mips/msa/idct_msa.c b/libs/libvpx/vp8/common/mips/msa/idct_msa.c index e1759c875e..3d516d0f81 100644 --- a/libs/libvpx/vp8/common/mips/msa/idct_msa.c +++ b/libs/libvpx/vp8/common/mips/msa/idct_msa.c @@ -90,8 +90,7 @@ static void idct4x4_addblk_msa(int16_t *input, uint8_t *pred, v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; v4i32 res0, res1, res2, res3; v16i8 zero = { 0 }; - v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3; - v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }; + v16i8 pred0, pred1, pred2, pred3; LD_SH2(input, 8, input0, input1); UNPCK_SH_SW(input0, in0, in1); @@ -111,20 +110,17 @@ static void idct4x4_addblk_msa(int16_t *input, uint8_t *pred, res1 = CLIP_SW_0_255(res1); res2 = CLIP_SW_0_255(res2); res3 = CLIP_SW_0_255(res3); - LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3); - VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1); - VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3); - ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride); + PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1); + res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1); + ST4x4_UB(res0, res0, 3, 2, 1, 0, dest, dest_stride); } static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred, int32_t pred_stride, uint8_t *dest, int32_t dest_stride) { - v8i16 vec; - v8i16 res0, res1, res2, res3; + v8i16 vec, res0, res1, res2, res3, dst0, dst1; v16i8 zero = { 0 }; - v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3; - v16i8 mask = { 0, 2, 4, 6, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }; + v16i8 pred0, pred1, pred2, pred3; vec = __msa_fill_h(in_dc); vec = __msa_srari_h(vec, 3); @@ -133,55 +129,59 @@ static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred, res2, res3); ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); CLIP_SH4_0_255(res0, res1, res2, res3); - LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3); - VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1); - VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3); - ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride); + PCKEV_B2_SH(res1, res0, res3, res2, dst0, dst1); + dst0 = (v8i16)__msa_pckev_w((v4i32)dst1, (v4i32)dst0); + ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dest, dest_stride); } void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff) { - v8i16 input0, input1; - v4i32 in0, in1, in2, in3, a1, b1, c1, d1; - v4i32 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; + v8i16 input0, input1, tmp0, tmp1, tmp2, 
tmp3, out0, out1; + const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 }; + const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 }; + const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 }; + const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 }; LD_SH2(input, 8, input0, input1); - UNPCK_SH_SW(input0, in0, in1); - UNPCK_SH_SW(input1, in2, in3); - BUTTERFLY_4(in0, in1, in2, in3, a1, b1, c1, d1); - BUTTERFLY_4(a1, d1, c1, b1, hz0, hz1, hz3, hz2); - TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); - BUTTERFLY_4(hz0, hz1, hz2, hz3, a1, b1, c1, d1); - BUTTERFLY_4(a1, d1, c1, b1, vt0, vt1, vt3, vt2); - ADD4(vt0, 3, vt1, 3, vt2, 3, vt3, 3, vt0, vt1, vt2, vt3); - SRA_4V(vt0, vt1, vt2, vt3, 3); - mb_dq_coeff[0] = __msa_copy_s_h((v8i16)vt0, 0); - mb_dq_coeff[16] = __msa_copy_s_h((v8i16)vt1, 0); - mb_dq_coeff[32] = __msa_copy_s_h((v8i16)vt2, 0); - mb_dq_coeff[48] = __msa_copy_s_h((v8i16)vt3, 0); - mb_dq_coeff[64] = __msa_copy_s_h((v8i16)vt0, 2); - mb_dq_coeff[80] = __msa_copy_s_h((v8i16)vt1, 2); - mb_dq_coeff[96] = __msa_copy_s_h((v8i16)vt2, 2); - mb_dq_coeff[112] = __msa_copy_s_h((v8i16)vt3, 2); - mb_dq_coeff[128] = __msa_copy_s_h((v8i16)vt0, 4); - mb_dq_coeff[144] = __msa_copy_s_h((v8i16)vt1, 4); - mb_dq_coeff[160] = __msa_copy_s_h((v8i16)vt2, 4); - mb_dq_coeff[176] = __msa_copy_s_h((v8i16)vt3, 4); - mb_dq_coeff[192] = __msa_copy_s_h((v8i16)vt0, 6); - mb_dq_coeff[208] = __msa_copy_s_h((v8i16)vt1, 6); - mb_dq_coeff[224] = __msa_copy_s_h((v8i16)vt2, 6); - mb_dq_coeff[240] = __msa_copy_s_h((v8i16)vt3, 6); + input1 = (v8i16)__msa_sldi_b((v16i8)input1, (v16i8)input1, 8); + tmp0 = input0 + input1; + tmp1 = input0 - input1; + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + out0 = tmp2 + tmp3; + out1 = tmp2 - tmp3; + VSHF_H2_SH(out0, out1, out0, out1, mask2, mask3, input0, input1); + tmp0 = input0 + input1; + tmp1 = input0 - input1; + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + tmp0 = tmp2 + tmp3; + tmp1 = tmp2 - tmp3; + ADD2(tmp0, 3, tmp1, 3, out0, out1); + out0 >>= 3; + out1 >>= 3; + mb_dq_coeff[0] = __msa_copy_s_h(out0, 0); + mb_dq_coeff[16] = __msa_copy_s_h(out0, 4); + mb_dq_coeff[32] = __msa_copy_s_h(out1, 0); + mb_dq_coeff[48] = __msa_copy_s_h(out1, 4); + mb_dq_coeff[64] = __msa_copy_s_h(out0, 1); + mb_dq_coeff[80] = __msa_copy_s_h(out0, 5); + mb_dq_coeff[96] = __msa_copy_s_h(out1, 1); + mb_dq_coeff[112] = __msa_copy_s_h(out1, 5); + mb_dq_coeff[128] = __msa_copy_s_h(out0, 2); + mb_dq_coeff[144] = __msa_copy_s_h(out0, 6); + mb_dq_coeff[160] = __msa_copy_s_h(out1, 2); + mb_dq_coeff[176] = __msa_copy_s_h(out1, 6); + mb_dq_coeff[192] = __msa_copy_s_h(out0, 3); + mb_dq_coeff[208] = __msa_copy_s_h(out0, 7); + mb_dq_coeff[224] = __msa_copy_s_h(out1, 3); + mb_dq_coeff[240] = __msa_copy_s_h(out1, 7); } static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input, uint8_t *dest, int32_t dest_stride) { v8i16 input0, input1, dequant_in0, dequant_in1, mul0, mul1; - v8i16 in0, in1, in2, in3; - v8i16 hz0_h, hz1_h, hz2_h, hz3_h; - v16i8 dest0, dest1, dest2, dest3; - v4i32 hz0_w, hz1_w, hz2_w, hz3_w; - v4i32 vt0, vt1, vt2, vt3, res0, res1, res2, res3; + v8i16 in0, in1, in2, in3, hz0_h, hz1_h, hz2_h, hz3_h; + v16u8 dest0, dest1, dest2, dest3; + v4i32 hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3, res0, res1, res2, res3; v2i64 zero = { 0 }; - v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }; LD_SH2(input, 8, input0, input1); LD_SH2(dequant_input, 8, dequant_in0, dequant_in1); @@ -196,7 +196,7 @@ static void dequant_idct4x4_addblk_msa(int16_t 
*input, int16_t *dequant_input, VP8_IDCT_1D_W(hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3); SRARI_W4_SW(vt0, vt1, vt2, vt3, 3); TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); - LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3); + LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3); ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1, res2, res3); ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1, res2, @@ -206,19 +206,17 @@ static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input, res1 = CLIP_SW_0_255(res1); res2 = CLIP_SW_0_255(res2); res3 = CLIP_SW_0_255(res3); - VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1); - VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3); - ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride); + PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1); + res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1); + ST4x4_UB(res0, res0, 3, 2, 1, 0, dest, dest_stride); } static void dequant_idct4x4_addblk_2x_msa(int16_t *input, int16_t *dequant_input, uint8_t *dest, int32_t dest_stride) { v16u8 dest0, dest1, dest2, dest3; - v8i16 in0, in1, in2, in3; - v8i16 mul0, mul1, mul2, mul3, dequant_in0, dequant_in1; - v8i16 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; - v8i16 res0, res1, res2, res3; + v8i16 in0, in1, in2, in3, mul0, mul1, mul2, mul3, dequant_in0, dequant_in1; + v8i16 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3, res0, res1, res2, res3; v4i32 hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r; v4i32 vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r; v16i8 zero = { 0 }; @@ -247,11 +245,8 @@ static void dequant_idct4x4_addblk_2x_msa(int16_t *input, res2, res3); ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3); CLIP_SH4_0_255(res0, res1, res2, res3); - PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1, res2, - res3); - PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1); - PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3); - ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride); + PCKEV_B2_SW(res1, res0, res3, res2, vt0l, vt1l); + ST8x4_UB(vt0l, vt1l, dest, dest_stride); __asm__ __volatile__( "sw $zero, 0(%[input]) \n\t" @@ -276,10 +271,9 @@ static void dequant_idct4x4_addblk_2x_msa(int16_t *input, static void dequant_idct_addconst_2x_msa(int16_t *input, int16_t *dequant_input, uint8_t *dest, int32_t dest_stride) { - v8i16 input_dc0, input_dc1, vec; + v8i16 input_dc0, input_dc1, vec, res0, res1, res2, res3; v16u8 dest0, dest1, dest2, dest3; v16i8 zero = { 0 }; - v8i16 res0, res1, res2, res3; input_dc0 = __msa_fill_h(input[0] * dequant_input[0]); input_dc1 = __msa_fill_h(input[16] * dequant_input[0]); @@ -292,11 +286,8 @@ static void dequant_idct_addconst_2x_msa(int16_t *input, int16_t *dequant_input, res2, res3); ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); CLIP_SH4_0_255(res0, res1, res2, res3); - PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1, res2, - res3); - PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1); - PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3); - ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride); + PCKEV_B2_SH(res1, res0, res3, res2, res0, res1); + ST8x4_UB(res0, res1, dest, dest_stride); } void vp8_short_idct4x4llm_msa(int16_t *input, uint8_t *pred_ptr, diff --git a/libs/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c b/libs/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c index f5f1790ef5..98a4fc09a3 100644 --- 
a/libs/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c +++ b/libs/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c @@ -24,208 +24,145 @@ mask = ((v16u8)mask <= b_limit); \ } -#define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out, \ - mask_in, hev_in) \ - { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ - v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in_out, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0_in_out, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in_out, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in_out, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - \ - filt = filt & (v16i8)hev_in; \ - \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = __msa_ldi_h(3); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \ - filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ - filt_l += q0_sub_p0_l; \ - filt_l = __msa_sat_s_h(filt_l, 7); \ - \ - filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ - filt = filt & (v16i8)mask_in; \ - \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - q0_in_out = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - p0_in_out = __msa_xori_b((v16u8)p0_m, 0x80); \ - \ - filt = __msa_srari_b(filt1, 1); \ - hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ - filt = filt & (v16i8)hev_in; \ - \ - q1_m = __msa_subs_s_b(q1_m, filt); \ - q1_in_out = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, filt); \ - p1_in_out = __msa_xori_b((v16u8)p1_m, 0x80); \ +#define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ + const v16i8 cnst4b = __msa_ldi_b(4); \ + const v16i8 cnst3b = __msa_ldi_b(3); \ + \ + p1_m = (v16i8)__msa_xori_b(p1, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + filt &= hev; \ + q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt &= mask; \ + t1 = __msa_adds_s_b(filt, cnst4b); \ + t1 >>= cnst3b; \ + t2 = __msa_adds_s_b(filt, cnst3b); \ + t2 >>= cnst3b; \ + q0_m = __msa_subs_s_b(q0_m, t1); \ + q0 = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, t2); \ + p0 = __msa_xori_b((v16u8)p0_m, 0x80); \ + filt = __msa_srari_b(t1, 1); \ + hev = __msa_xori_b(hev, 0xff); \ + filt &= hev; \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1 = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1 = __msa_xori_b((v16u8)p1_m, 0x80); \ } -#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) \ - { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, q0_sub_p0_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b, filt_sign; \ - v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ - p0_m = 
(v16i8)__msa_xori_b(p0_in, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = __msa_ldi_h(3); \ - q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0); \ - q0_sub_p0_r *= cnst3h; \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0); \ - q0_sub_p0_l *= cnst3h; \ - filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ - filt_l += q0_sub_p0_l; \ - filt_l = __msa_sat_s_h(filt_l, 7); \ - \ - filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ - filt = filt & (v16i8)(mask); \ - \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - q0_in = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_in = __msa_xori_b((v16u8)p0_m, 0x80); \ +#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, filt, filt1, filt2; \ + v16i8 q0_sub_p0; \ + const v16i8 cnst4b = __msa_ldi_b(4); \ + const v16i8 cnst3b = __msa_ldi_b(3); \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt &= mask; \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= cnst3b; \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= cnst3b; \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + q0_in = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_in = __msa_xori_b((v16u8)p0_m, 0x80); \ } -#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \ - { \ - v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \ - v16i8 filt, q0_sub_p0, cnst4b, cnst3b; \ - v16i8 u, filt1, filt2, filt_sign, q0_sub_p0_sign; \ - v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_r, u_r, u_l, filt_l; \ - v8i16 cnst3h, cnst27h, cnst18h, cnst63h; \ - \ - cnst3h = __msa_ldi_h(3); \ - \ - p2_m = (v16i8)__msa_xori_b(p2, 0x80); \ - p1_m = (v16i8)__msa_xori_b(p1, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1, 0x80); \ - q2_m = (v16i8)__msa_xori_b(q2, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - q0_sub_p0 = q0_m - p0_m; \ - q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0); \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0); \ - q0_sub_p0_r *= cnst3h; \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r = filt_r + q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0); \ - q0_sub_p0_l *= cnst3h; \ - filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ - filt_l = filt_l + q0_sub_p0_l; \ - filt_l = __msa_sat_s_h(filt_l, 7); \ - \ - filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ - filt = filt & (v16i8)mask; \ - filt2 = filt & (v16i8)hev; \ - \ - hev = __msa_xori_b(hev, 0xff); \ - filt = filt & (v16i8)hev; \ - 
cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt2, cnst4b); \ - filt1 >>= 3; \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt2, cnst3b); \ - filt2 >>= 3; \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - \ - filt_sign = __msa_clti_s_b(filt, 0); \ - ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \ - \ - cnst27h = __msa_ldi_h(27); \ - cnst63h = __msa_ldi_h(63); \ - \ - u_r = filt_r * cnst27h; \ - u_r += cnst63h; \ - u_r >>= 7; \ - u_r = __msa_sat_s_h(u_r, 7); \ - u_l = filt_l * cnst27h; \ - u_l += cnst63h; \ - u_l >>= 7; \ - u_l = __msa_sat_s_h(u_l, 7); \ - u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ - q0_m = __msa_subs_s_b(q0_m, u); \ - q0 = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, u); \ - p0 = __msa_xori_b((v16u8)p0_m, 0x80); \ - cnst18h = __msa_ldi_h(18); \ - u_r = filt_r * cnst18h; \ - u_r += cnst63h; \ - u_r >>= 7; \ - u_r = __msa_sat_s_h(u_r, 7); \ - \ - u_l = filt_l * cnst18h; \ - u_l += cnst63h; \ - u_l >>= 7; \ - u_l = __msa_sat_s_h(u_l, 7); \ - u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ - q1_m = __msa_subs_s_b(q1_m, u); \ - q1 = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, u); \ - p1 = __msa_xori_b((v16u8)p1_m, 0x80); \ - u_r = filt_r << 3; \ - u_r += filt_r + cnst63h; \ - u_r >>= 7; \ - u_r = __msa_sat_s_h(u_r, 7); \ - \ - u_l = filt_l << 3; \ - u_l += filt_l + cnst63h; \ - u_l >>= 7; \ - u_l = __msa_sat_s_h(u_l, 7); \ - u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ - q2_m = __msa_subs_s_b(q2_m, u); \ - q2 = __msa_xori_b((v16u8)q2_m, 0x80); \ - p2_m = __msa_adds_s_b(p2_m, u); \ - p2 = __msa_xori_b((v16u8)p2_m, 0x80); \ +#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \ + { \ + v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \ + v16i8 u, filt, t1, t2, filt_sign, q0_sub_p0; \ + v8i16 filt_r, filt_l, u_r, u_l; \ + v8i16 temp0, temp1, temp2, temp3; \ + const v16i8 cnst4b = __msa_ldi_b(4); \ + const v16i8 cnst3b = __msa_ldi_b(3); \ + const v8i16 cnst9h = __msa_ldi_h(9); \ + const v8i16 cnst63h = __msa_ldi_h(63); \ + \ + p2_m = (v16i8)__msa_xori_b(p2, 0x80); \ + p1_m = (v16i8)__msa_xori_b(p1, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1, 0x80); \ + q2_m = (v16i8)__msa_xori_b(q2, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt &= mask; \ + \ + t2 = filt & hev; \ + hev = __msa_xori_b(hev, 0xff); \ + filt &= hev; \ + t1 = __msa_adds_s_b(t2, cnst4b); \ + t1 >>= cnst3b; \ + t2 = __msa_adds_s_b(t2, cnst3b); \ + t2 >>= cnst3b; \ + q0_m = __msa_subs_s_b(q0_m, t1); \ + p0_m = __msa_adds_s_b(p0_m, t2); \ + filt_sign = __msa_clti_s_b(filt, 0); \ + ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \ + temp0 = filt_r * cnst9h; \ + temp1 = temp0 + cnst63h; \ + temp2 = filt_l * cnst9h; \ + temp3 = temp2 + cnst63h; \ + \ + u_r = temp1 >> 7; \ + u_r = __msa_sat_s_h(u_r, 7); \ + u_l = temp3 >> 7; \ + u_l = __msa_sat_s_h(u_l, 7); \ + u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ + q2_m = __msa_subs_s_b(q2_m, u); \ + p2_m = __msa_adds_s_b(p2_m, u); \ + q2 = __msa_xori_b((v16u8)q2_m, 0x80); \ + p2 = __msa_xori_b((v16u8)p2_m, 0x80); \ + \ + temp1 += temp0; \ + temp3 += temp2; \ + \ + u_r = temp1 >> 7; \ + u_r = __msa_sat_s_h(u_r, 7); \ + u_l = temp3 >> 7; \ + u_l = __msa_sat_s_h(u_l, 7); \ + u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ + q1_m = 
__msa_subs_s_b(q1_m, u); \ + p1_m = __msa_adds_s_b(p1_m, u); \ + q1 = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1 = __msa_xori_b((v16u8)p1_m, 0x80); \ + \ + temp1 += temp0; \ + temp3 += temp2; \ + \ + u_r = temp1 >> 7; \ + u_r = __msa_sat_s_h(u_r, 7); \ + u_l = temp3 >> 7; \ + u_l = __msa_sat_s_h(u_l, 7); \ + u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ + q0_m = __msa_subs_s_b(q0_m, u); \ + p0_m = __msa_adds_s_b(p0_m, u); \ + q0 = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0 = __msa_xori_b((v16u8)p0_m, 0x80); \ } #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ diff --git a/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h b/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h index 65905f6c02..6bec3adec3 100644 --- a/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h +++ b/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h @@ -1221,6 +1221,8 @@ } #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) +#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) +#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__) #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ out2, out3) \ diff --git a/libs/libvpx/vp8/common/onyx.h b/libs/libvpx/vp8/common/onyx.h index 43e3c29b50..72fba2ec56 100644 --- a/libs/libvpx/vp8/common/onyx.h +++ b/libs/libvpx/vp8/common/onyx.h @@ -110,6 +110,8 @@ typedef struct { int Sharpness; int cpu_used; unsigned int rc_max_intra_bitrate_pct; + /* percent of rate boost for golden frame in CBR mode. */ + unsigned int gf_cbr_boost_pct; unsigned int screen_content_mode; /* mode -> diff --git a/libs/libvpx/vp8/common/onyxd.h b/libs/libvpx/vp8/common/onyxd.h index e05461aad0..d3c1b0e972 100644 --- a/libs/libvpx/vp8/common/onyxd.h +++ b/libs/libvpx/vp8/common/onyxd.h @@ -22,6 +22,7 @@ extern "C" { #include "vpx/vp8.h" struct VP8D_COMP; +struct VP8Common; typedef struct { int Width; @@ -45,6 +46,7 @@ int vp8dx_receive_compressed_data(struct VP8D_COMP *comp, size_t size, int vp8dx_get_raw_frame(struct VP8D_COMP *comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags); +int vp8dx_references_buffer(struct VP8Common *oci, int ref_frame); vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *comp, enum vpx_ref_frame_type ref_frame_flag, @@ -52,6 +54,7 @@ vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *comp, vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP *comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); +int vp8dx_get_quantizer(const struct VP8D_COMP *c); #ifdef __cplusplus } diff --git a/libs/libvpx/vp8/common/postproc.c b/libs/libvpx/vp8/common/postproc.c index 8b8c1701a3..d67ee8a57d 100644 --- a/libs/libvpx/vp8/common/postproc.c +++ b/libs/libvpx/vp8/common/postproc.c @@ -12,6 +12,7 @@ #include "vpx_dsp_rtcd.h" #include "vp8_rtcd.h" #include "vpx_dsp/postproc.h" +#include "vpx_ports/system_state.h" #include "vpx_scale_rtcd.h" #include "vpx_scale/yv12config.h" #include "postproc.h" @@ -321,7 +322,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, } } - vp8_clear_system_state(); + vpx_clear_system_state(); if ((flags & VP8D_MFQE) && oci->postproc_state.last_frame_valid && oci->current_video_frame >= 2 && @@ -363,7 +364,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, oci->postproc_state.last_noise != noise_level) { double sigma; struct postproc_state *ppstate = &oci->postproc_state; - vp8_clear_system_state(); + vpx_clear_system_state(); sigma = noise_level + .5 + .6 * q / 63.0; 
ppstate->clamp = vpx_setup_noise(sigma, ppstate->generated_noise, oci->Width + 256); diff --git a/libs/libvpx/vp8/common/ppflags.h b/libs/libvpx/vp8/common/ppflags.h index 713f5dffe0..96e3af6c9c 100644 --- a/libs/libvpx/vp8/common/ppflags.h +++ b/libs/libvpx/vp8/common/ppflags.h @@ -19,14 +19,7 @@ enum { VP8D_DEBLOCK = 1 << 0, VP8D_DEMACROBLOCK = 1 << 1, VP8D_ADDNOISE = 1 << 2, - VP8D_DEBUG_TXT_FRAME_INFO = 1 << 3, - VP8D_DEBUG_TXT_MBLK_MODES = 1 << 4, - VP8D_DEBUG_TXT_DC_DIFF = 1 << 5, - VP8D_DEBUG_TXT_RATE_INFO = 1 << 6, - VP8D_DEBUG_DRAW_MV = 1 << 7, - VP8D_DEBUG_CLR_BLK_MODES = 1 << 8, - VP8D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9, - VP8D_MFQE = 1 << 10 + VP8D_MFQE = 1 << 3 }; typedef struct { diff --git a/libs/libvpx/vp8/common/reconintra.c b/libs/libvpx/vp8/common/reconintra.c index 986074ec7e..8e2094da87 100644 --- a/libs/libvpx/vp8/common/reconintra.c +++ b/libs/libvpx/vp8/common/reconintra.c @@ -71,8 +71,16 @@ void vp8_build_intra_predictors_mbuv_s( unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char *upred_ptr, unsigned char *vpred_ptr, int pred_stride) { MB_PREDICTION_MODE uvmode = x->mode_info_context->mbmi.uv_mode; +#if HAVE_VSX + /* Power PC implementation uses "vec_vsx_ld" to read 16 bytes from + uleft_col and vleft_col. Play it safe by reserving enough stack + space here. */ + unsigned char uleft_col[16]; + unsigned char vleft_col[16]; +#else unsigned char uleft_col[8]; unsigned char vleft_col[8]; +#endif int i; intra_pred_fn fn; diff --git a/libs/libvpx/vp8/common/reconintra4x4.c b/libs/libvpx/vp8/common/reconintra4x4.c index 07c9223331..64d33a2873 100644 --- a/libs/libvpx/vp8/common/reconintra4x4.c +++ b/libs/libvpx/vp8/common/reconintra4x4.c @@ -31,7 +31,7 @@ void vp8_init_intra4x4_predictors_internal(void) { pred[B_LD_PRED] = vpx_d45e_predictor_4x4; pred[B_RD_PRED] = vpx_d135_predictor_4x4; pred[B_VR_PRED] = vpx_d117_predictor_4x4; - pred[B_VL_PRED] = vpx_d63f_predictor_4x4; + pred[B_VL_PRED] = vpx_d63e_predictor_4x4; pred[B_HD_PRED] = vpx_d153_predictor_4x4; pred[B_HU_PRED] = vpx_d207_predictor_4x4; } @@ -40,7 +40,15 @@ void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft, int left_stride, B_PREDICTION_MODE b_mode, unsigned char *dst, int dst_stride, unsigned char top_left) { - unsigned char Aboveb[12], *Above = Aboveb + 4; +/* Power PC implementation uses "vec_vsx_ld" to read 16 bytes from + Above (aka, Aboveb + 4). Play it safe by reserving enough stack + space here. Similary for "Left". */ +#if HAVE_VSX + unsigned char Aboveb[20]; +#else + unsigned char Aboveb[12]; +#endif + unsigned char *Above = Aboveb + 4; #if HAVE_NEON // Neon intrinsics are unable to load 32 bits, or 4 8 bit values. Instead, it // over reads but does not use the extra 4 values. @@ -50,6 +58,8 @@ void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft, // indeed read, they are not used. vp8_zero_array(Left, 8); #endif // VPX_WITH_ASAN +#elif HAVE_VSX + unsigned char Left[16]; #else unsigned char Left[4]; #endif // HAVE_NEON diff --git a/libs/libvpx/vp8/common/rtcd_defs.pl b/libs/libvpx/vp8/common/rtcd_defs.pl index 8dc36f7317..3df745f75a 100644 --- a/libs/libvpx/vp8/common/rtcd_defs.pl +++ b/libs/libvpx/vp8/common/rtcd_defs.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. 
All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + sub vp8_common_forward_decls() { print < #include -#define thread_sleep(nms) {struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} +#define thread_sleep(nms) sched_yield(); +/* {struct timespec ts;ts.tv_sec=0; + ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */ #endif /* Not Windows. Assume pthreads */ @@ -188,6 +192,16 @@ static inline int sem_destroy(sem_t *sem) { #endif #include "vpx_util/vpx_thread.h" +#include "vpx_util/vpx_atomics.h" + +static INLINE void vp8_atomic_spin_wait( + int mb_col, const vpx_atomic_int *last_row_current_mb_col, + const int nsync) { + while (mb_col > (vpx_atomic_load_acquire(last_row_current_mb_col) - nsync)) { + x86_pause_hint(); + thread_sleep(0); + } +} #endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */ diff --git a/libs/libvpx/vp8/common/vp8_loopfilter.c b/libs/libvpx/vp8/common/vp8_loopfilter.c index c6430be465..9fb1250650 100644 --- a/libs/libvpx/vp8/common/vp8_loopfilter.c +++ b/libs/libvpx/vp8/common/vp8_loopfilter.c @@ -111,11 +111,9 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd, /* Note the baseline filter values for each segment */ if (mbd->segmentation_enabled) { - /* Abs value */ if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) { lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; - } else /* Delta Value */ - { + } else { /* Delta Value */ lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; } lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0; @@ -344,8 +342,7 @@ void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int frame_type) { mode_info_context++; /* Skip border mb */ } - } else /* SIMPLE_LOOPFILTER */ - { + } else { /* SIMPLE_LOOPFILTER */ for (mb_row = 0; mb_row < mb_rows; ++mb_row) { for (mb_col = 0; mb_col < mb_cols; ++mb_col) { int skip_lf = (mode_info_context->mbmi.mode != B_PRED && diff --git a/libs/libvpx/vp8/common/vp8_skin_detection.c b/libs/libvpx/vp8/common/vp8_skin_detection.c new file mode 100644 index 0000000000..6739efa5fe --- /dev/null +++ b/libs/libvpx/vp8/common/vp8_skin_detection.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/alloccommon.h" +#include "vp8/common/vp8_skin_detection.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_util/vpx_write_yuv_frame.h" + +static int avg_2x2(const uint8_t *s, int p) { + int i, j; + int sum = 0; + for (i = 0; i < 2; ++i, s += p) { + for (j = 0; j < 2; ++j) { + sum += s[j]; + } + } + return (sum + 2) >> 2; +} + +int vp8_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int stride, int strideuv, + SKIN_DETECTION_BLOCK_SIZE bsize, int consec_zeromv, + int curr_motion_magn) { + // No skin if block has been zero/small motion for long consecutive time. + if (consec_zeromv > 60 && curr_motion_magn == 0) { + return 0; + } else { + int motion = 1; + if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0; + if (bsize == SKIN_16X16) { + // Take the average of center 2x2 pixels. 
+ const int ysource = avg_2x2(y + 7 * stride + 7, stride); + const int usource = avg_2x2(u + 3 * strideuv + 3, strideuv); + const int vsource = avg_2x2(v + 3 * strideuv + 3, strideuv); + return vpx_skin_pixel(ysource, usource, vsource, motion); + } else { + int num_skin = 0; + int i, j; + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + // Take the average of center 2x2 pixels. + const int ysource = avg_2x2(y + 3 * stride + 3, stride); + const int usource = avg_2x2(u + strideuv + 1, strideuv); + const int vsource = avg_2x2(v + strideuv + 1, strideuv); + num_skin += vpx_skin_pixel(ysource, usource, vsource, motion); + if (num_skin >= 2) return 1; + y += 8; + u += 4; + v += 4; + } + y += (stride << 3) - 16; + u += (strideuv << 2) - 8; + v += (strideuv << 2) - 8; + } + + return 0; + } + } +} + +#ifdef OUTPUT_YUV_SKINMAP +// For viewing skin map on input source. +void vp8_compute_skin_map(VP8_COMP *const cpi, FILE *yuv_skinmap_file) { + int i, j, mb_row, mb_col, num_bl; + VP8_COMMON *const cm = &cpi->common; + uint8_t *y; + const uint8_t *src_y = cpi->Source->y_buffer; + const int src_ystride = cpi->Source->y_stride; + int offset = 0; + + YV12_BUFFER_CONFIG skinmap; + memset(&skinmap, 0, sizeof(skinmap)); + if (vp8_yv12_alloc_frame_buffer(&skinmap, cm->Width, cm->Height, + VP8BORDERINPIXELS) < 0) { + vpx_free_frame_buffer(&skinmap); + return; + } + memset(skinmap.buffer_alloc, 128, skinmap.frame_size); + y = skinmap.y_buffer; + // Loop through blocks and set skin map based on center pixel of block. + // Set y to white for skin block, otherwise set to source with gray scale. + for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 1) { + num_bl = 0; + for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 1) { + const int is_skin = cpi->skin_map[offset++]; + for (i = 0; i < 16; i++) { + for (j = 0; j < 16; j++) { + y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j]; + } + } + num_bl++; + y += 16; + src_y += 16; + } + y += (src_ystride << 4) - (num_bl << 4); + src_y += (src_ystride << 4) - (num_bl << 4); + } + vpx_write_yuv_frame(yuv_skinmap_file, &skinmap); + vpx_free_frame_buffer(&skinmap); +} +#endif // OUTPUT_YUV_SKINMAP diff --git a/libs/libvpx/vp8/common/vp8_skin_detection.h b/libs/libvpx/vp8/common/vp8_skin_detection.h new file mode 100644 index 0000000000..4d27f5eb2e --- /dev/null +++ b/libs/libvpx/vp8/common/vp8_skin_detection.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_COMMON_SKIN_DETECTION_H_ +#define VP8_COMMON_SKIN_DETECTION_H_ + +#include "vp8/encoder/onyx_int.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/skin_detection.h" +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; + +typedef enum { + // Skin detection based on 8x8 block. If two of them are identified as skin, + // the macroblock is marked as skin. + SKIN_8X8, + // Skin detection based on 16x16 block. 
+ SKIN_16X16 +} SKIN_DETECTION_BLOCK_SIZE; + +int vp8_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int stride, int strideuv, + SKIN_DETECTION_BLOCK_SIZE bsize, int consec_zeromv, + int curr_motion_magn); + +#ifdef OUTPUT_YUV_SKINMAP +// For viewing skin map on input source. +void vp8_compute_skin_map(struct VP8_COMP *const cpi, FILE *yuv_skinmap_file); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_SKIN_DETECTION_H_ diff --git a/libs/libvpx/vp8/common/x86/copy_sse2.asm b/libs/libvpx/vp8/common/x86/copy_sse2.asm index 86fae26956..480faa2550 100644 --- a/libs/libvpx/vp8/common/x86/copy_sse2.asm +++ b/libs/libvpx/vp8/common/x86/copy_sse2.asm @@ -11,6 +11,7 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text ;void vp8_copy32xn_sse2( ; unsigned char *src_ptr, diff --git a/libs/libvpx/vp8/common/x86/copy_sse3.asm b/libs/libvpx/vp8/common/x86/copy_sse3.asm index d789a40ccf..31ea898a3d 100644 --- a/libs/libvpx/vp8/common/x86/copy_sse3.asm +++ b/libs/libvpx/vp8/common/x86/copy_sse3.asm @@ -83,6 +83,7 @@ ret %endmacro +SECTION .text ;void vp8_copy32xn_sse3( ; unsigned char *src_ptr, diff --git a/libs/libvpx/vp8/common/x86/dequantize_mmx.asm b/libs/libvpx/vp8/common/x86/dequantize_mmx.asm index 4e551f00aa..bfdd997782 100644 --- a/libs/libvpx/vp8/common/x86/dequantize_mmx.asm +++ b/libs/libvpx/vp8/common/x86/dequantize_mmx.asm @@ -11,6 +11,7 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text ;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q) global sym(vp8_dequantize_b_impl_mmx) PRIVATE diff --git a/libs/libvpx/vp8/common/x86/idct_blk_mmx.c b/libs/libvpx/vp8/common/x86/idct_blk_mmx.c index 6194cdf8fa..fd804b1ca4 100644 --- a/libs/libvpx/vp8/common/x86/idct_blk_mmx.c +++ b/libs/libvpx/vp8/common/x86/idct_blk_mmx.c @@ -21,91 +21,3 @@ void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC) { vp8_dequantize_b_impl_mmx(sq, dq, DQC); } - -void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, - int stride, char *eobs) { - int i; - - for (i = 0; i < 4; ++i) { - if (eobs[0] > 1) { - vp8_dequant_idct_add_mmx(q, dq, dst, stride); - } else if (eobs[0] == 1) { - vp8_dc_only_idct_add_mmx(q[0] * dq[0], dst, stride, dst, stride); - memset(q, 0, 2 * sizeof(q[0])); - } - - if (eobs[1] > 1) { - vp8_dequant_idct_add_mmx(q + 16, dq, dst + 4, stride); - } else if (eobs[1] == 1) { - vp8_dc_only_idct_add_mmx(q[16] * dq[0], dst + 4, stride, dst + 4, stride); - memset(q + 16, 0, 2 * sizeof(q[0])); - } - - if (eobs[2] > 1) { - vp8_dequant_idct_add_mmx(q + 32, dq, dst + 8, stride); - } else if (eobs[2] == 1) { - vp8_dc_only_idct_add_mmx(q[32] * dq[0], dst + 8, stride, dst + 8, stride); - memset(q + 32, 0, 2 * sizeof(q[0])); - } - - if (eobs[3] > 1) { - vp8_dequant_idct_add_mmx(q + 48, dq, dst + 12, stride); - } else if (eobs[3] == 1) { - vp8_dc_only_idct_add_mmx(q[48] * dq[0], dst + 12, stride, dst + 12, - stride); - memset(q + 48, 0, 2 * sizeof(q[0])); - } - - q += 64; - dst += 4 * stride; - eobs += 4; - } -} - -void vp8_dequant_idct_add_uv_block_mmx(short *q, short *dq, unsigned char *dstu, - unsigned char *dstv, int stride, - char *eobs) { - int i; - - for (i = 0; i < 2; ++i) { - if (eobs[0] > 1) { - vp8_dequant_idct_add_mmx(q, dq, dstu, stride); - } else if (eobs[0] == 1) { - vp8_dc_only_idct_add_mmx(q[0] * dq[0], dstu, stride, dstu, stride); - memset(q, 0, 2 * sizeof(q[0])); - } - - if (eobs[1] > 1) { - vp8_dequant_idct_add_mmx(q + 16, dq, dstu + 4, stride); - } else if (eobs[1] == 1) { - 
vp8_dc_only_idct_add_mmx(q[16] * dq[0], dstu + 4, stride, dstu + 4, - stride); - memset(q + 16, 0, 2 * sizeof(q[0])); - } - - q += 32; - dstu += 4 * stride; - eobs += 2; - } - - for (i = 0; i < 2; ++i) { - if (eobs[0] > 1) { - vp8_dequant_idct_add_mmx(q, dq, dstv, stride); - } else if (eobs[0] == 1) { - vp8_dc_only_idct_add_mmx(q[0] * dq[0], dstv, stride, dstv, stride); - memset(q, 0, 2 * sizeof(q[0])); - } - - if (eobs[1] > 1) { - vp8_dequant_idct_add_mmx(q + 16, dq, dstv + 4, stride); - } else if (eobs[1] == 1) { - vp8_dc_only_idct_add_mmx(q[16] * dq[0], dstv + 4, stride, dstv + 4, - stride); - memset(q + 16, 0, 2 * sizeof(q[0])); - } - - q += 32; - dstv += 4 * stride; - eobs += 2; - } -} diff --git a/libs/libvpx/vp8/common/x86/idctllm_mmx.asm b/libs/libvpx/vp8/common/x86/idctllm_mmx.asm index 96fa2c60d0..5773d9d84e 100644 --- a/libs/libvpx/vp8/common/x86/idctllm_mmx.asm +++ b/libs/libvpx/vp8/common/x86/idctllm_mmx.asm @@ -31,6 +31,7 @@ ; * ; **************************************************************************/ +SECTION .text ;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, ;int pitch, unsigned char *dest,int stride) diff --git a/libs/libvpx/vp8/common/x86/idctllm_sse2.asm b/libs/libvpx/vp8/common/x86/idctllm_sse2.asm index bf8e2c4021..560faba000 100644 --- a/libs/libvpx/vp8/common/x86/idctllm_sse2.asm +++ b/libs/libvpx/vp8/common/x86/idctllm_sse2.asm @@ -19,6 +19,8 @@ ; int dst_stride - 3 ; ) +SECTION .text + global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE sym(vp8_idct_dequant_0_2x_sse2): push rbp diff --git a/libs/libvpx/vp8/common/x86/iwalsh_mmx.asm b/libs/libvpx/vp8/common/x86/iwalsh_mmx.asm deleted file mode 100644 index 158c3b7458..0000000000 --- a/libs/libvpx/vp8/common/x86/iwalsh_mmx.asm +++ /dev/null @@ -1,140 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp8_short_inv_walsh4x4_mmx(short *input, short *output) -global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE -sym(vp8_short_inv_walsh4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - ; end prolog - - mov rdx, arg(0) - mov rax, 30003h - - movq mm0, [rdx + 0] ;ip[0] - movq mm1, [rdx + 8] ;ip[4] - movq mm7, rax - - movq mm2, [rdx + 16] ;ip[8] - movq mm3, [rdx + 24] ;ip[12] - punpcklwd mm7, mm7 ;0003000300030003h - mov rdx, arg(1) - - movq mm4, mm0 - movq mm5, mm1 - - paddw mm4, mm3 ;ip[0] + ip[12] aka al - paddw mm5, mm2 ;ip[4] + ip[8] aka bl - - movq mm6, mm4 ;temp al - paddw mm4, mm5 ;al + bl - psubw mm6, mm5 ;al - bl - - psubw mm0, mm3 ;ip[0] - ip[12] aka d1 - psubw mm1, mm2 ;ip[4] - ip[8] aka c1 - - movq mm5, mm0 ;temp dl - paddw mm0, mm1 ;dl + cl - psubw mm5, mm1 ;dl - cl - - ; 03 02 01 00 - ; 13 12 11 10 - ; 23 22 21 20 - ; 33 32 31 30 - - movq mm3, mm4 ; 03 02 01 00 - punpcklwd mm4, mm0 ; 11 01 10 00 - punpckhwd mm3, mm0 ; 13 03 12 02 - - movq mm1, mm6 ; 23 22 21 20 - punpcklwd mm6, mm5 ; 31 21 30 20 - punpckhwd mm1, mm5 ; 33 23 32 22 - - movq mm0, mm4 ; 11 01 10 00 - movq mm2, mm3 ; 13 03 12 02 - - punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] - punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4] - - punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8] - punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12] -;~~~~~~~~~~~~~~~~~~~~~ - movq mm1, mm0 - movq mm5, mm4 - paddw mm1, mm3 ;ip[0] + ip[12] aka al - paddw mm5, mm2 ;ip[4] + ip[8] aka bl - - movq mm6, mm1 ;temp al - paddw mm1, mm5 ;al + bl - psubw mm6, mm5 ;al - bl - paddw mm1, mm7 - paddw mm6, mm7 - psraw mm1, 3 - psraw mm6, 3 - - psubw mm0, mm3 ;ip[0] - ip[12] aka d1 - psubw mm4, mm2 ;ip[4] - ip[8] aka c1 - - movq mm5, mm0 ;temp dl - paddw mm0, mm4 ;dl + cl - psubw mm5, mm4 ;dl - cl - paddw mm0, mm7 - paddw mm5, mm7 - psraw mm0, 3 - psraw mm5, 3 -;~~~~~~~~~~~~~~~~~~~~~ - - movd eax, mm1 - movd ecx, mm0 - psrlq mm0, 32 - psrlq mm1, 32 - mov word ptr[rdx+32*0], ax - mov word ptr[rdx+32*1], cx - shr eax, 16 - shr ecx, 16 - mov word ptr[rdx+32*4], ax - mov word ptr[rdx+32*5], cx - movd eax, mm1 - movd ecx, mm0 - mov word ptr[rdx+32*8], ax - mov word ptr[rdx+32*9], cx - shr eax, 16 - shr ecx, 16 - mov word ptr[rdx+32*12], ax - mov word ptr[rdx+32*13], cx - - movd eax, mm6 - movd ecx, mm5 - psrlq mm5, 32 - psrlq mm6, 32 - mov word ptr[rdx+32*2], ax - mov word ptr[rdx+32*3], cx - shr eax, 16 - shr ecx, 16 - mov word ptr[rdx+32*6], ax - mov word ptr[rdx+32*7], cx - movd eax, mm6 - movd ecx, mm5 - mov word ptr[rdx+32*10], ax - mov word ptr[rdx+32*11], cx - shr eax, 16 - shr ecx, 16 - mov word ptr[rdx+32*14], ax - mov word ptr[rdx+32*15], cx - - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - diff --git a/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm b/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm index 06e86a80b6..82d7bf91a6 100644 --- a/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm +++ b/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vp8_short_inv_walsh4x4_sse2(short *input, short *output) global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE sym(vp8_short_inv_walsh4x4_sse2): diff --git a/libs/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm b/libs/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm index 6d5aaa19db..6a3d05290f 100644 --- a/libs/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm +++ b/libs/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm @@ -125,6 +125,8 @@ pxor %1, [GLOBAL(t80)] %endmacro +SECTION .text + 
;void vp8_loop_filter_bh_y_sse2 ;( ; unsigned char *src_ptr, diff --git a/libs/libvpx/vp8/common/x86/loopfilter_sse2.asm b/libs/libvpx/vp8/common/x86/loopfilter_sse2.asm index 1913abc69b..2ae028feae 100644 --- a/libs/libvpx/vp8/common/x86/loopfilter_sse2.asm +++ b/libs/libvpx/vp8/common/x86/loopfilter_sse2.asm @@ -276,6 +276,8 @@ %endmacro +SECTION .text + %if ABI_IS_32BIT ;void vp8_loop_filter_horizontal_edge_sse2 diff --git a/libs/libvpx/vp8/common/x86/loopfilter_x86.c b/libs/libvpx/vp8/common/x86/loopfilter_x86.c index aa2f8f1acf..a187d51fbe 100644 --- a/libs/libvpx/vp8/common/x86/loopfilter_x86.c +++ b/libs/libvpx/vp8/common/x86/loopfilter_x86.c @@ -22,13 +22,6 @@ #define prototype_simple_loopfilter(sym) \ void sym(unsigned char *y, int ystride, const unsigned char *blimit) -prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx); -prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx); -prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx); -prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx); -prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx); -prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx); - #if HAVE_SSE2 && ARCH_X86_64 prototype_loopfilter(vp8_loop_filter_bv_y_sse2); prototype_loopfilter(vp8_loop_filter_bh_y_sse2); @@ -44,105 +37,6 @@ extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2; extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2; extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2; -#if HAVE_MMX -/* Horizontal MB filtering */ -void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); - - if (u_ptr) { - vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); - } - - if (v_ptr) { - vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); - } -} - -/* Vertical MB Filtering */ -void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); - - if (u_ptr) { - vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); - } - - if (v_ptr) { - vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); - } -} - -/* Horizontal B Filtering */ -void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) { - vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); - } - - if (v_ptr) { - vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); - } -} - -void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * 
y_stride, y_stride, - blimit); - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, - blimit); - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, - blimit); -} - -/* Vertical B Filtering */ -void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - - if (u_ptr) { - vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 1); - } - - if (v_ptr) { - vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 1); - } -} - -void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit); - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit); - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit); -} -#endif - /* Horizontal MB filtering */ #if HAVE_SSE2 void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, diff --git a/libs/libvpx/vp8/common/x86/mfqe_sse2.asm b/libs/libvpx/vp8/common/x86/mfqe_sse2.asm index 8177b79226..3fde973ad2 100644 --- a/libs/libvpx/vp8/common/x86/mfqe_sse2.asm +++ b/libs/libvpx/vp8/common/x86/mfqe_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vp8_filter_by_weight16x16_sse2 ;( ; unsigned char *src, diff --git a/libs/libvpx/vp8/common/x86/recon_mmx.asm b/libs/libvpx/vp8/common/x86/recon_mmx.asm index 15e98713c7..e6a48f6b07 100644 --- a/libs/libvpx/vp8/common/x86/recon_mmx.asm +++ b/libs/libvpx/vp8/common/x86/recon_mmx.asm @@ -11,6 +11,7 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text ;void copy_mem8x8_mmx( ; unsigned char *src, @@ -117,158 +118,3 @@ sym(vp8_copy_mem8x4_mmx): UNSHADOW_ARGS pop rbp ret - - -;void copy_mem16x16_mmx( -; unsigned char *src, -; int src_stride, -; unsigned char *dst, -; int dst_stride -; ) -global sym(vp8_copy_mem16x16_mmx) PRIVATE -sym(vp8_copy_mem16x16_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src; - movsxd rax, dword ptr arg(1) ;src_stride; - - mov rdi, arg(2) ;dst; - movsxd rcx, dword ptr arg(3) ;dst_stride - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - 
movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq [rdi], mm0 - movq [rdi+8], mm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/libs/libvpx/vp8/common/x86/recon_sse2.asm b/libs/libvpx/vp8/common/x86/recon_sse2.asm index cb89537f76..57f8899c77 100644 --- a/libs/libvpx/vp8/common/x86/recon_sse2.asm +++ b/libs/libvpx/vp8/common/x86/recon_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void copy_mem16x16_sse2( ; unsigned char *src, ; int src_stride, diff --git a/libs/libvpx/vp8/common/x86/subpixel_mmx.asm b/libs/libvpx/vp8/common/x86/subpixel_mmx.asm index 47dd452297..1f3a2baca0 100644 --- a/libs/libvpx/vp8/common/x86/subpixel_mmx.asm +++ b/libs/libvpx/vp8/common/x86/subpixel_mmx.asm @@ -17,6 +17,7 @@ extern sym(vp8_bilinear_filters_x86_8) %define vp8_filter_weight 128 %define VP8_FILTER_SHIFT 7 +SECTION .text ;void vp8_filter_block1d_h6_mmx ;( @@ -204,163 +205,6 @@ sym(vp8_filter_block1dc_v6_mmx): ret -;void bilinear_predict8x8_mmx -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp8_bilinear_predict8x8_mmx) PRIVATE -sym(vp8_bilinear_predict8x8_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; - - movsxd rax, dword ptr arg(2) ;xoffset - mov rdi, arg(4) ;dst_ptr ; - - shl rax, 5 ; offset * 32 - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - - add rax, rcx ; HFilter - mov rsi, arg(0) ;src_ptr ; - - movsxd rdx, dword ptr arg(5) ;dst_pitch - movq mm1, [rax] ; - - movq mm2, [rax+16] ; - movsxd rax, dword ptr arg(3) ;yoffset - - pxor mm0, mm0 ; - - shl rax, 5 ; offset*32 - add rax, rcx ; VFilter - - lea rcx, [rdi+rdx*8] ; - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; - - - - ; get the first horizontal line done ; - movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movq mm4, mm3 ; make a copy of current line - - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - punpckhbw mm4, mm0 ; - - pmullw mm3, mm1 ; - pmullw mm4, mm1 ; - - movq mm5, [rsi+1] ; - movq mm6, mm5 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 ; - - pmullw mm5, mm2 ; - pmullw mm6, mm2 ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - movq mm7, mm3 ; - packuswb mm7, mm4 ; - - add rsi, rdx ; next line -.next_row_8x8: - movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 
14 - movq mm4, mm3 ; make a copy of current line - - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - punpckhbw mm4, mm0 ; - - pmullw mm3, mm1 ; - pmullw mm4, mm1 ; - - movq mm5, [rsi+1] ; - movq mm6, mm5 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 ; - - pmullw mm5, mm2 ; - pmullw mm6, mm2 ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - movq mm5, mm7 ; - movq mm6, mm7 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 - - pmullw mm5, [rax] ; - pmullw mm6, [rax] ; - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - movq mm7, mm3 ; - packuswb mm7, mm4 ; - - - pmullw mm3, [rax+16] ; - pmullw mm4, [rax+16] ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - packuswb mm3, mm4 - - movq [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, rdx ; next line - add rdi, dword ptr arg(5) ;dst_pitch ; -%else - movsxd r8, dword ptr arg(5) ;dst_pitch - add rsi, rdx ; next line - add rdi, r8 ;dst_pitch -%endif - cmp rdi, rcx ; - jne .next_row_8x8 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - ;void bilinear_predict8x4_mmx ;( ; unsigned char *src_ptr, @@ -641,8 +485,8 @@ rd: times 4 dw 0x40 align 16 -global HIDDEN_DATA(sym(vp8_six_tap_mmx)) -sym(vp8_six_tap_mmx): +global HIDDEN_DATA(sym(vp8_six_tap_x86)) +sym(vp8_six_tap_x86): times 8 dw 0 times 8 dw 0 times 8 dw 128 diff --git a/libs/libvpx/vp8/common/x86/subpixel_sse2.asm b/libs/libvpx/vp8/common/x86/subpixel_sse2.asm index ca00583ca2..6e70f6d2e8 100644 --- a/libs/libvpx/vp8/common/x86/subpixel_sse2.asm +++ b/libs/libvpx/vp8/common/x86/subpixel_sse2.asm @@ -16,6 +16,7 @@ extern sym(vp8_bilinear_filters_x86_8) %define VP8_FILTER_WEIGHT 128 %define VP8_FILTER_SHIFT 7 +SECTION .text ;/************************************************************************************ ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The diff --git a/libs/libvpx/vp8/common/x86/subpixel_ssse3.asm b/libs/libvpx/vp8/common/x86/subpixel_ssse3.asm index 1f6cbd1d1e..8d55c9320c 100644 --- a/libs/libvpx/vp8/common/x86/subpixel_ssse3.asm +++ b/libs/libvpx/vp8/common/x86/subpixel_ssse3.asm @@ -15,6 +15,7 @@ %define VP8_FILTER_WEIGHT 128 %define VP8_FILTER_SHIFT 7 +SECTION .text ;/************************************************************************************ ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. 
The diff --git a/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c b/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c index af2c1e7633..b9d087e20d 100644 --- a/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c +++ b/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c @@ -13,7 +13,7 @@ #include "vpx_ports/mem.h" #include "filter_x86.h" -extern const short vp8_six_tap_mmx[8][6 * 8]; +extern const short vp8_six_tap_x86[8][6 * 8]; extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr, unsigned short *output_ptr, @@ -82,103 +82,13 @@ void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, DECLARE_ALIGNED(16, unsigned short, FData2[16 * 16]); /* Temp data bufffer used in filtering */ const short *HFilter, *VFilter; - HFilter = vp8_six_tap_mmx[xoffset]; + HFilter = vp8_six_tap_x86[xoffset]; vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter); - VFilter = vp8_six_tap_mmx[yoffset]; + VFilter = vp8_six_tap_x86[yoffset]; vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4, VFilter); } - -void vp8_sixtap_predict16x16_mmx(unsigned char *src_ptr, - int src_pixels_per_line, int xoffset, - int yoffset, unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED(16, unsigned short, - FData2[24 * 24]); /* Temp data bufffer used in filtering */ - - const short *HFilter, *VFilter; - - HFilter = vp8_six_tap_mmx[xoffset]; - - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, - src_pixels_per_line, 1, 21, 32, HFilter); - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, - src_pixels_per_line, 1, 21, 32, HFilter); - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, - src_pixels_per_line, 1, 21, 32, HFilter); - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, - FData2 + 12, src_pixels_per_line, 1, 21, 32, - HFilter); - - VFilter = vp8_six_tap_mmx[yoffset]; - vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, 16, - VFilter); - vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16, 16, - 16, VFilter); - vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16, 16, - 16, VFilter); - vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16, 16, - 16, VFilter); -} - -void vp8_sixtap_predict8x8_mmx(unsigned char *src_ptr, int src_pixels_per_line, - int xoffset, int yoffset, unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED(16, unsigned short, - FData2[256]); /* Temp data bufffer used in filtering */ - - const short *HFilter, *VFilter; - - HFilter = vp8_six_tap_mmx[xoffset]; - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, - src_pixels_per_line, 1, 13, 16, HFilter); - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, - src_pixels_per_line, 1, 13, 16, HFilter); - - VFilter = vp8_six_tap_mmx[yoffset]; - vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8, 8, - VFilter); - vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8, 8, 8, - VFilter); -} - -void vp8_sixtap_predict8x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, - int xoffset, int yoffset, unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED(16, unsigned short, - FData2[256]); /* Temp data bufffer used in filtering */ - - const short *HFilter, *VFilter; - - HFilter = vp8_six_tap_mmx[xoffset]; - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, - src_pixels_per_line, 1, 9, 16, HFilter); - 
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, - src_pixels_per_line, 1, 9, 16, HFilter); - - VFilter = vp8_six_tap_mmx[yoffset]; - vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4, 8, - VFilter); - vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8, 4, 8, - VFilter); -} - -void vp8_bilinear_predict16x16_mmx(unsigned char *src_ptr, - int src_pixels_per_line, int xoffset, - int yoffset, unsigned char *dst_ptr, - int dst_pitch) { - vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, - dst_ptr, dst_pitch); - vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 8, dst_pitch); - vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, - src_pixels_per_line, xoffset, yoffset, - dst_ptr + dst_pitch * 8, dst_pitch); - vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, - src_pixels_per_line, xoffset, yoffset, - dst_ptr + dst_pitch * 8 + 8, dst_pitch); -} #endif #if HAVE_SSE2 @@ -195,21 +105,21 @@ void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, if (xoffset) { if (yoffset) { - HFilter = vp8_six_tap_mmx[xoffset]; + HFilter = vp8_six_tap_x86[xoffset]; vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); - VFilter = vp8_six_tap_mmx[yoffset]; + VFilter = vp8_six_tap_x86[yoffset]; vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, dst_pitch, VFilter); } else { /* First-pass only */ - HFilter = vp8_six_tap_mmx[xoffset]; + HFilter = vp8_six_tap_x86[xoffset]; vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter); } } else { /* Second-pass only */ - VFilter = vp8_six_tap_mmx[yoffset]; + VFilter = vp8_six_tap_x86[yoffset]; vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32); vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, @@ -226,21 +136,21 @@ void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, if (xoffset) { if (yoffset) { - HFilter = vp8_six_tap_mmx[xoffset]; + HFilter = vp8_six_tap_x86[xoffset]; vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); - VFilter = vp8_six_tap_mmx[yoffset]; + VFilter = vp8_six_tap_x86[yoffset]; vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8, dst_pitch, VFilter); } else { /* First-pass only */ - HFilter = vp8_six_tap_mmx[xoffset]; + HFilter = vp8_six_tap_x86[xoffset]; vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter); } } else { /* Second-pass only */ - VFilter = vp8_six_tap_mmx[yoffset]; + VFilter = vp8_six_tap_x86[yoffset]; vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter); @@ -256,21 +166,21 @@ void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, if (xoffset) { if (yoffset) { - HFilter = vp8_six_tap_mmx[xoffset]; + HFilter = vp8_six_tap_x86[xoffset]; vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); - VFilter = vp8_six_tap_mmx[yoffset]; + VFilter = vp8_six_tap_x86[yoffset]; vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4, dst_pitch, VFilter); } else { /* First-pass only */ - HFilter = vp8_six_tap_mmx[xoffset]; + HFilter = vp8_six_tap_x86[xoffset]; 
vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter); } } else { /* Second-pass only */ - VFilter = vp8_six_tap_mmx[yoffset]; + VFilter = vp8_six_tap_x86[yoffset]; vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter); diff --git a/libs/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm b/libs/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm deleted file mode 100644 index 88a07b9f3f..0000000000 --- a/libs/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm +++ /dev/null @@ -1,1753 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - - -;void vp8_loop_filter_horizontal_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE -sym(vp8_loop_filter_horizontal_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - movsxd rcx, dword ptr arg(5) ;count -.next8_h: - mov rdx, arg(3) ;limit - movq mm7, [rdx] - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - - ; calculate breakout conditions - movq mm2, [rdi+2*rax] ; q3 - movq mm1, [rsi+2*rax] ; q2 - movq mm6, mm1 ; q2 - psubusb mm1, mm2 ; q2-=q3 - psubusb mm2, mm6 ; q3-=q2 - por mm1, mm2 ; abs(q3-q2) - psubusb mm1, mm7 ; - - - movq mm4, [rsi+rax] ; q1 - movq mm3, mm4 ; q1 - psubusb mm4, mm6 ; q1-=q2 - psubusb mm6, mm3 ; q2-=q1 - por mm4, mm6 ; abs(q2-q1) - - psubusb mm4, mm7 - por mm1, mm4 - - movq mm4, [rsi] ; q0 - movq mm0, mm4 ; q0 - psubusb mm4, mm3 ; q0-=q1 - psubusb mm3, mm0 ; q1-=q0 - por mm4, mm3 ; abs(q0-q1) - movq t0, mm4 ; save to t0 - psubusb mm4, mm7 - por mm1, mm4 - - - neg rax ; negate pitch to deal with above border - - movq mm2, [rsi+4*rax] ; p3 - movq mm4, [rdi+4*rax] ; p2 - movq mm5, mm4 ; p2 - psubusb mm4, mm2 ; p2-=p3 - psubusb mm2, mm5 ; p3-=p2 - por mm4, mm2 ; abs(p3 - p2) - psubusb mm4, mm7 - por mm1, mm4 - - - movq mm4, [rsi+2*rax] ; p1 - movq mm3, mm4 ; p1 - psubusb mm4, mm5 ; p1-=p2 - psubusb mm5, mm3 ; p2-=p1 - por mm4, mm5 ; abs(p2 - p1) - psubusb mm4, mm7 - por mm1, mm4 - - movq mm2, mm3 ; p1 - - movq mm4, [rsi+rax] ; p0 - movq mm5, mm4 ; p0 - psubusb mm4, mm3 ; p0-=p1 - psubusb mm3, mm5 ; p1-=p0 - por mm4, mm3 ; abs(p1 - p0) - movq t1, mm4 ; save to t1 - psubusb mm4, mm7 - por mm1, mm4 - - movq mm3, [rdi] ; q1 - movq mm4, mm3 ; q1 - psubusb mm3, mm2 ; q1-=p1 - psubusb mm2, mm4 ; p1-=q1 - por mm2, mm3 ; abs(p1-q1) - pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm2, 1 ; abs(p1-q1)/2 - - movq mm6, mm5 ; p0 - movq mm3, [rsi] ; q0 - psubusb mm5, mm3 ; p0-=q0 - psubusb mm3, mm6 ; q0-=p0 - por mm5, mm3 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, 
arg(2) ;blimit ; get blimit - movq mm7, [rdx] ; blimit - - psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por mm1, mm5 - pxor mm5, mm5 - pcmpeqb mm1, mm5 ; mask mm1 - - ; calculate high edge variance - mov rdx, arg(4) ;thresh ; get thresh - movq mm7, [rdx] ; - movq mm4, t0 ; get abs (q1 - q0) - psubusb mm4, mm7 - movq mm3, t1 ; get abs (p1 - p0) - psubusb mm3, mm7 - paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - - pcmpeqb mm4, mm5 - - pcmpeqb mm5, mm5 - pxor mm4, mm5 - - - ; start work on filters - movq mm2, [rsi+2*rax] ; p1 - movq mm7, [rdi] ; q1 - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - pand mm2, mm4 ; high var mask (hvm)(p1 - q1) - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) - paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) - paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) - pand mm1, mm2 ; mask filter values we don't care about - movq mm2, mm1 - paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 - paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 - - pxor mm0, mm0 ; - pxor mm5, mm5 - punpcklbw mm0, mm2 ; - punpckhbw mm5, mm2 ; - psraw mm0, 11 ; - psraw mm5, 11 - packsswb mm0, mm5 - movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; - - pxor mm0, mm0 ; 0 - movq mm5, mm1 ; abcdefgh - punpcklbw mm0, mm1 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - pxor mm1, mm1 ; 0 - punpckhbw mm1, mm5 ; a0b0c0d0 - psraw mm1, 11 ; sign extended shift right by 3 - movq mm5, mm0 ; save results - - packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 - paddsw mm5, [GLOBAL(ones)] - paddsw mm1, [GLOBAL(ones)] - psraw mm5, 1 ; partial shifted one more time for 2nd tap - psraw mm1, 1 ; partial shifted one more time for 2nd tap - packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 - pandn mm4, mm5 ; high edge variance additive - - paddsb mm6, mm2 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - movq [rsi+rax], mm6 ; write back - - movq mm6, [rsi+2*rax] ; p1 - pxor mm6, [GLOBAL(t80)] ; reoffset - paddsb mm6, mm4 ; p1+= p1 add - pxor mm6, [GLOBAL(t80)] ; unoffset - movq [rsi+2*rax], mm6 ; write back - - psubsb mm3, mm0 ; q0-= q0 add - pxor mm3, [GLOBAL(t80)] ; unoffset - movq [rsi], mm3 ; write back - - psubsb mm7, mm4 ; q1-= q1 add - pxor mm7, [GLOBAL(t80)] ; unoffset - movq [rdi], mm7 ; write back - - add rsi,8 - neg rax - dec rcx - jnz .next8_h - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_loop_filter_vertical_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE -sym(vp8_loop_filter_vertical_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 64 ; reserve 64 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[32]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
- - lea rsi, [rsi + rax*4 - 4] - - movsxd rcx, dword ptr arg(5) ;count -.next8_v: - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - - - ;transpose - movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 - movq mm7, mm6 ; 77 76 75 74 73 72 71 70 - - punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64 - punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60 - - movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 - movq mm5, mm4 ; 47 46 45 44 43 42 41 40 - - punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44 - punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40 - - movq mm3, mm5 ; 57 47 56 46 55 45 54 44 - punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 - - punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 - movq mm2, mm4 ; 53 43 52 42 51 41 50 40 - - punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 - punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 - - neg rax - movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 - - movq mm1, mm6 ; 27 26 25 24 23 22 21 20 - punpckhbw mm6, [rsi+rax] ; 37 27 36 36 35 25 34 24 - - punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20 - movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 - - punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 - movq mm0, mm7 ; 17 07 16 06 15 05 14 04 - - punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 - punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 - - movq mm6, mm7 ; 37 27 17 07 36 26 16 06 - punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 - - punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 - - movq mm5, mm6 ; 76 66 56 46 36 26 16 06 - psubusb mm5, mm7 ; q2-q3 - - psubusb mm7, mm6 ; q3-q2 - por mm7, mm5; ; mm7=abs (q3-q2) - - movq mm5, mm0 ; 35 25 15 05 34 24 14 04 - punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 - - punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 - movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 - - psubusb mm3, mm6 ; q1-q2 - psubusb mm6, mm5 ; q2-q1 - - por mm6, mm3 ; mm6=abs(q2-q1) - lea rdx, srct - - movq [rdx+24], mm5 ; save q1 - movq [rdx+16], mm0 ; save q0 - - movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 - punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 - - movq mm0, mm3 ; 13 03 12 02 11 01 10 00 - punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 - - punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 - movq mm1, mm0 ; 31 21 11 01 30 20 10 00 - - punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 - punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 - - movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 - psubusb mm2, mm0 ; p2-p3 - - psubusb mm0, mm1 ; p3-p2 - por mm0, mm2 ; mm0=abs(p3-p2) - - movq mm2, mm3 ; 33 23 13 03 32 22 12 02 - punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 - - punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 - movq [rdx+8], mm3 ; save p0 - - movq [rdx], mm2 ; save p1 - movq mm5, mm2 ; mm5 = p1 - - psubusb mm2, mm1 ; p1-p2 - psubusb mm1, mm5 ; p2-p1 - - por mm1, mm2 ; mm1=abs(p2-p1) - mov rdx, arg(3) ;limit - - movq mm4, [rdx] ; mm4 = limit - psubusb mm7, mm4 - - psubusb mm0, mm4 - psubusb mm1, mm4 - - psubusb mm6, mm4 - por mm7, mm6 - - por mm0, mm1 - por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit - - movq mm1, mm5 ; p1 - - movq mm7, mm3 ; mm3=mm7=p0 - psubusb mm7, mm5 ; p0 - p1 - - psubusb mm5, mm3 ; p1 - p0 - por mm5, mm7 ; abs(p1-p0) - - movq t0, mm5 ; save abs(p1-p0) - lea rdx, srct - - psubusb mm5, mm4 - por mm0, mm5 ; mm0=mask - - movq mm5, [rdx+16] ; mm5=q0 - movq mm7, [rdx+24] ; mm7=q1 - - movq mm6, mm5 ; mm6=q0 - movq mm2, mm7 ; q1 - psubusb mm5, mm7 ; q0-q1 - - psubusb mm7, mm6 ; q1-q0 - por mm7, mm5 ; abs(q1-q0) - - movq t1, mm7 ; save 
abs(q1-q0) - psubusb mm7, mm4 - - por mm0, mm7 ; mask - - movq mm5, mm2 ; q1 - psubusb mm5, mm1 ; q1-=p1 - psubusb mm1, mm2 ; p1-=q1 - por mm5, mm1 ; abs(p1-q1) - pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm5, 1 ; abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; - - movq mm4, [rdx] ;blimit - movq mm1, mm3 ; mm1=mm3=p0 - - movq mm7, mm6 ; mm7=mm6=q0 - psubusb mm1, mm7 ; p0-q0 - - psubusb mm7, mm3 ; q0-p0 - por mm1, mm7 ; abs(q0-p0) - paddusb mm1, mm1 ; abs(q0-p0)*2 - paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por mm1, mm0; ; mask - - pxor mm0, mm0 - pcmpeqb mm1, mm0 - - ; calculate high edge variance - mov rdx, arg(4) ;thresh ; get thresh - movq mm7, [rdx] - ; - movq mm4, t0 ; get abs (q1 - q0) - psubusb mm4, mm7 - - movq mm3, t1 ; get abs (p1 - p0) - psubusb mm3, mm7 - - por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - pcmpeqb mm4, mm0 - - pcmpeqb mm0, mm0 - pxor mm4, mm0 - - - - ; start work on filters - lea rdx, srct - - movq mm2, [rdx] ; p1 - movq mm7, [rdx+24] ; q1 - - movq mm6, [rdx+8] ; p0 - movq mm0, [rdx+16] ; q0 - - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - - psubsb mm2, mm7 ; p1 - q1 - pand mm2, mm4 ; high var mask (hvm)(p1 - q1) - - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - - paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) - paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) - - paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) - pand mm1, mm2 ; mask filter values we don't care about - - movq mm2, mm1 - paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 - - paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 - pxor mm0, mm0 ; - - pxor mm5, mm5 - punpcklbw mm0, mm2 ; - - punpckhbw mm5, mm2 ; - psraw mm0, 11 ; - - psraw mm5, 11 - packsswb mm0, mm5 - - movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; - - pxor mm0, mm0 ; 0 - movq mm5, mm1 ; abcdefgh - - punpcklbw mm0, mm1 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - - pxor mm1, mm1 ; 0 - punpckhbw mm1, mm5 ; a0b0c0d0 - - psraw mm1, 11 ; sign extended shift right by 3 - movq mm5, mm0 ; save results - - packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 - paddsw mm5, [GLOBAL(ones)] - - paddsw mm1, [GLOBAL(ones)] - psraw mm5, 1 ; partial shifted one more time for 2nd tap - - psraw mm1, 1 ; partial shifted one more time for 2nd tap - packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 - - pandn mm4, mm5 ; high edge variance additive - - paddsb mm6, mm2 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - - ; mm6=p0 ; - movq mm1, [rdx] ; p1 - pxor mm1, [GLOBAL(t80)] ; reoffset - - paddsb mm1, mm4 ; p1+= p1 add - pxor mm1, [GLOBAL(t80)] ; unoffset - ; mm6 = p0 mm1 = p1 - - psubsb mm3, mm0 ; q0-= q0 add - pxor mm3, [GLOBAL(t80)] ; unoffset - - ; mm3 = q0 - psubsb mm7, mm4 ; q1-= q1 add - pxor mm7, [GLOBAL(t80)] ; unoffset - ; mm7 = q1 - - ; transpose and write back - ; mm1 = 72 62 52 42 32 22 12 02 - ; mm6 = 73 63 53 43 33 23 13 03 - ; mm3 = 74 64 54 44 34 24 14 04 - ; mm7 = 75 65 55 45 35 25 15 05 - - movq mm2, mm1 ; 72 62 52 42 32 22 12 02 - punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02 - - movq mm4, mm3 ; 74 64 54 44 34 24 14 04 - punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42 - - punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04 - punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44 - 
- movq mm6, mm2 ; 33 32 23 22 13 12 03 02 - punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02 - - punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22 - movq mm5, mm1 ; 73 72 63 62 53 52 43 42 - - punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42 - punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62 - - - ; mm2 = 15 14 13 12 05 04 03 02 - ; mm6 = 35 34 33 32 25 24 23 22 - ; mm5 = 55 54 53 52 45 44 43 42 - ; mm1 = 75 74 73 72 65 64 63 62 - - - - movd [rsi+rax*4+2], mm2 - psrlq mm2, 32 - - movd [rdi+rax*4+2], mm2 - movd [rsi+rax*2+2], mm6 - - psrlq mm6, 32 - movd [rsi+rax+2],mm6 - - movd [rsi+2], mm1 - psrlq mm1, 32 - - movd [rdi+2], mm1 - neg rax - - movd [rdi+rax+2],mm5 - psrlq mm5, 32 - - movd [rdi+rax*2+2], mm5 - - lea rsi, [rsi+rax*8] - dec rcx - jnz .next8_v - - add rsp, 64 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_mbloop_filter_horizontal_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE -sym(vp8_mbloop_filter_horizontal_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - movsxd rcx, dword ptr arg(5) ;count -.next8_mbh: - mov rdx, arg(3) ;limit - movq mm7, [rdx] - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - - ; calculate breakout conditions - movq mm2, [rdi+2*rax] ; q3 - - movq mm1, [rsi+2*rax] ; q2 - movq mm6, mm1 ; q2 - psubusb mm1, mm2 ; q2-=q3 - psubusb mm2, mm6 ; q3-=q2 - por mm1, mm2 ; abs(q3-q2) - psubusb mm1, mm7 - - - ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit - movq mm4, [rsi+rax] ; q1 - movq mm3, mm4 ; q1 - psubusb mm4, mm6 ; q1-=q2 - psubusb mm6, mm3 ; q2-=q1 - por mm4, mm6 ; abs(q2-q1) - psubusb mm4, mm7 - por mm1, mm4 - - - ; mm1 = mask, mm3=q1, mm7 = limit - - movq mm4, [rsi] ; q0 - movq mm0, mm4 ; q0 - psubusb mm4, mm3 ; q0-=q1 - psubusb mm3, mm0 ; q1-=q0 - por mm4, mm3 ; abs(q0-q1) - movq t0, mm4 ; save to t0 - psubusb mm4, mm7 - por mm1, mm4 - - - ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) - - neg rax ; negate pitch to deal with above border - - movq mm2, [rsi+4*rax] ; p3 - movq mm4, [rdi+4*rax] ; p2 - movq mm5, mm4 ; p2 - psubusb mm4, mm2 ; p2-=p3 - psubusb mm2, mm5 ; p3-=p2 - por mm4, mm2 ; abs(p3 - p2) - psubusb mm4, mm7 - por mm1, mm4 - ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) - - movq mm4, [rsi+2*rax] ; p1 - movq mm3, mm4 ; p1 - psubusb mm4, mm5 ; p1-=p2 - psubusb mm5, mm3 ; p2-=p1 - por mm4, mm5 ; abs(p2 - p1) - psubusb mm4, mm7 - por mm1, mm4 - - movq mm2, mm3 ; p1 - - - ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) - - movq mm4, [rsi+rax] ; p0 - movq mm5, mm4 ; p0 - psubusb mm4, mm3 ; p0-=p1 - psubusb mm3, mm5 ; p1-=p0 - por mm4, mm3 ; abs(p1 - p0) - movq t1, mm4 ; save to t1 - psubusb mm4, mm7 - por mm1, mm4 - ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0) - ; mm5 = p0 - movq mm3, [rdi] ; q1 - movq mm4, mm3 ; q1 - psubusb mm3, mm2 ; q1-=p1 - psubusb mm2, mm4 ; p1-=q1 - por mm2, mm3 ; abs(p1-q1) - pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm2, 1 ; abs(p1-q1)/2 - - movq mm6, mm5 ; p0 - movq mm3, mm0 ; q0 - psubusb mm5, mm3 ; p0-=q0 - psubusb 
mm3, mm6 ; q0-=p0 - por mm5, mm3 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; get blimit - movq mm7, [rdx] ; blimit - - psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por mm1, mm5 - pxor mm5, mm5 - pcmpeqb mm1, mm5 ; mask mm1 - - ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0) - ; mm6 = p0, - - ; calculate high edge variance - mov rdx, arg(4) ;thresh ; get thresh - movq mm7, [rdx] ; - movq mm4, t0 ; get abs (q1 - q0) - psubusb mm4, mm7 - movq mm3, t1 ; get abs (p1 - p0) - psubusb mm3, mm7 - paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - - pcmpeqb mm4, mm5 - - pcmpeqb mm5, mm5 - pxor mm4, mm5 - - - - ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0) - ; mm6 = p0, mm4=hev - ; start work on filters - movq mm2, [rsi+2*rax] ; p1 - movq mm7, [rdi] ; q1 - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) - paddsb mm2, mm0 ; 2 * (q0 - p0) - paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) - pand mm1, mm2 ; mask filter values we don't care about - - - ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 - movq mm2, mm1 ; vp8_filter - pand mm2, mm4; ; Filter2 = vp8_filter & hev - - movq mm5, mm2 ; - paddsb mm5, [GLOBAL(t3)]; - - pxor mm0, mm0 ; 0 - pxor mm7, mm7 ; 0 - - punpcklbw mm0, mm5 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - punpckhbw mm7, mm5 ; a0b0c0d0 - psraw mm7, 11 ; sign extended shift right by 3 - packsswb mm0, mm7 ; Filter2 >>=3; - - movq mm5, mm0 ; Filter2 - - paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) - pxor mm0, mm0 ; 0 - pxor mm7, mm7 ; 0 - - punpcklbw mm0, mm2 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - punpckhbw mm7, mm2 ; a0b0c0d0 - psraw mm7, 11 ; sign extended shift right by 3 - packsswb mm0, mm7 ; Filter2 >>=3; - - ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 - psubsb mm3, mm0 ; qs0 =qs0 - filter1 - paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 - - ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 - ; vp8_filter &= ~hev; - ; Filter2 = vp8_filter; - pandn mm4, mm1 ; vp8_filter&=~hev - - - ; mm3=qs0, mm4=filter2, mm6=ps0 - - ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); - ; s = vp8_signed_char_clamp(qs0 - u); - ; *oq0 = s^0x80; - ; s = vp8_signed_char_clamp(ps0 + u); - ; *op0 = s^0x80; - pxor mm0, mm0 - - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s27)] - pmulhw mm2, [GLOBAL(s27)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - psubsb mm3, mm1 - paddsb mm6, mm1 - - pxor mm3, [GLOBAL(t80)] - pxor mm6, [GLOBAL(t80)] - movq [rsi+rax], mm6 - movq [rsi], mm3 - - ; roughly 2/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); - ; s = vp8_signed_char_clamp(qs1 - u); - ; *oq1 = s^0x80; - ; s = vp8_signed_char_clamp(ps1 + u); - ; *op1 = s^0x80; - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s18)] - pmulhw mm2, [GLOBAL(s18)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - movq mm3, [rdi] - movq mm6, 
[rsi+rax*2] ; p1 - - pxor mm3, [GLOBAL(t80)] - pxor mm6, [GLOBAL(t80)] - - paddsb mm6, mm1 - psubsb mm3, mm1 - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - movq [rdi], mm3 - movq [rsi+rax*2], mm6 - - ; roughly 1/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); - ; s = vp8_signed_char_clamp(qs2 - u); - ; *oq2 = s^0x80; - ; s = vp8_signed_char_clamp(ps2 + u); - ; *op2 = s^0x80; - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s9)] - pmulhw mm2, [GLOBAL(s9)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - - movq mm6, [rdi+rax*4] - neg rax - movq mm3, [rdi+rax ] - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - - paddsb mm6, mm1 - psubsb mm3, mm1 - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - movq [rdi+rax ], mm3 - neg rax - movq [rdi+rax*4], mm6 - -;EARLY_BREAK_OUT: - neg rax - add rsi,8 - dec rcx - jnz .next8_mbh - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_mbloop_filter_vertical_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE -sym(vp8_mbloop_filter_vertical_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 96 ; reserve 96 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
- - lea rsi, [rsi + rax*4 - 4] - - movsxd rcx, dword ptr arg(5) ;count -.next8_mbv: - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - - ;transpose - movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70 - movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 - - movq mm7, mm6 ; 77 76 75 74 73 72 71 70 - punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64 - - punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 - movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50 - - movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 - movq mm5, mm4 ; 47 46 45 44 43 42 41 40 - - punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44 - punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 - - movq mm3, mm5 ; 57 47 56 46 55 45 54 44 - punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 - - punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 - movq mm2, mm4 ; 53 43 52 42 51 41 50 40 - - punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 - punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 - - neg rax - - movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30 - movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 - - movq mm1, mm6 ; 27 26 25 24 23 22 21 20 - punpckhbw mm6, mm7 ; 37 27 36 36 35 25 34 24 - - punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20 - - movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 - punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 - - movq mm0, mm7 ; 17 07 16 06 15 05 14 04 - punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 - - punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 - movq mm6, mm7 ; 37 27 17 07 36 26 16 06 - - punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 - punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 - - lea rdx, srct - movq mm5, mm6 ; 76 66 56 46 36 26 16 06 - - movq [rdx+56], mm7 - psubusb mm5, mm7 ; q2-q3 - - - movq [rdx+48], mm6 - psubusb mm7, mm6 ; q3-q2 - - por mm7, mm5; ; mm7=abs (q3-q2) - movq mm5, mm0 ; 35 25 15 05 34 24 14 04 - - punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 - punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 - - movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 - psubusb mm3, mm6 ; q1-q2 - - psubusb mm6, mm5 ; q2-q1 - por mm6, mm3 ; mm6=abs(q2-q1) - - movq [rdx+40], mm5 ; save q1 - movq [rdx+32], mm0 ; save q0 - - movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 - punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 - - movq mm0, mm3 ; 13 03 12 02 11 01 10 00 - punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 - - punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 - movq mm1, mm0 ; 31 21 11 01 30 20 10 00 - - punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 - punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 - - movq [rdx], mm0 ; save p3 - movq [rdx+8], mm1 ; save p2 - - movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 - psubusb mm2, mm0 ; p2-p3 - - psubusb mm0, mm1 ; p3-p2 - por mm0, mm2 ; mm0=abs(p3-p2) - - movq mm2, mm3 ; 33 23 13 03 32 22 12 02 - punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 - - punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 - movq [rdx+24], mm3 ; save p0 - - movq [rdx+16], mm2 ; save p1 - movq mm5, mm2 ; mm5 = p1 - - psubusb mm2, mm1 ; p1-p2 - psubusb mm1, mm5 ; p2-p1 - - por mm1, mm2 ; mm1=abs(p2-p1) - mov rdx, arg(3) ;limit - - movq mm4, [rdx] ; mm4 = limit - psubusb mm7, mm4 ; abs(q3-q2) > limit - - psubusb mm0, mm4 ; abs(p3-p2) > limit - psubusb mm1, mm4 ; abs(p2-p1) > limit - - psubusb mm6, mm4 ; abs(q2-q1) > limit - por mm7, mm6 ; or - - por mm0, mm1 ; - por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit - - movq mm1, mm5 ; p1 - - movq mm7, mm3 ; mm3=mm7=p0 - psubusb mm7, mm5 ; p0 - p1 - - psubusb mm5, mm3 ; p1 - p0 - por mm5, mm7 ; abs(p1-p0) - - movq t0, 
mm5 ; save abs(p1-p0) - lea rdx, srct - - psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit - por mm0, mm5 ; mm0=mask - - movq mm5, [rdx+32] ; mm5=q0 - movq mm7, [rdx+40] ; mm7=q1 - - movq mm6, mm5 ; mm6=q0 - movq mm2, mm7 ; q1 - psubusb mm5, mm7 ; q0-q1 - - psubusb mm7, mm6 ; q1-q0 - por mm7, mm5 ; abs(q1-q0) - - movq t1, mm7 ; save abs(q1-q0) - psubusb mm7, mm4 ; mm7=abs(q1-q0)> limit - - por mm0, mm7 ; mask - - movq mm5, mm2 ; q1 - psubusb mm5, mm1 ; q1-=p1 - psubusb mm1, mm2 ; p1-=q1 - por mm5, mm1 ; abs(p1-q1) - pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm5, 1 ; abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; - - movq mm4, [rdx] ;blimit - movq mm1, mm3 ; mm1=mm3=p0 - - movq mm7, mm6 ; mm7=mm6=q0 - psubusb mm1, mm7 ; p0-q0 - - psubusb mm7, mm3 ; q0-p0 - por mm1, mm7 ; abs(q0-p0) - paddusb mm1, mm1 ; abs(q0-p0)*2 - paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por mm1, mm0; ; mask - - pxor mm0, mm0 - pcmpeqb mm1, mm0 - - ; calculate high edge variance - mov rdx, arg(4) ;thresh ; get thresh - movq mm7, [rdx] - ; - movq mm4, t0 ; get abs (q1 - q0) - psubusb mm4, mm7 ; abs(q1 - q0) > thresh - - movq mm3, t1 ; get abs (p1 - p0) - psubusb mm3, mm7 ; abs(p1 - p0)> thresh - - por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - pcmpeqb mm4, mm0 - - pcmpeqb mm0, mm0 - pxor mm4, mm0 - - - - - ; start work on filters - lea rdx, srct - - ; start work on filters - movq mm2, [rdx+16] ; p1 - movq mm7, [rdx+40] ; q1 - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - - movq mm6, [rdx+24] ; p0 - movq mm0, [rdx+32] ; q0 - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) - paddsb mm2, mm0 ; 2 * (q0 - p0) - paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) - pand mm1, mm2 ; mask filter values we don't care about - - ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 - movq mm2, mm1 ; vp8_filter - pand mm2, mm4; ; Filter2 = vp8_filter & hev - - movq mm5, mm2 ; - paddsb mm5, [GLOBAL(t3)]; - - pxor mm0, mm0 ; 0 - pxor mm7, mm7 ; 0 - - punpcklbw mm0, mm5 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - punpckhbw mm7, mm5 ; a0b0c0d0 - psraw mm7, 11 ; sign extended shift right by 3 - packsswb mm0, mm7 ; Filter2 >>=3; - - movq mm5, mm0 ; Filter2 - - paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) - pxor mm0, mm0 ; 0 - pxor mm7, mm7 ; 0 - - punpcklbw mm0, mm2 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - punpckhbw mm7, mm2 ; a0b0c0d0 - psraw mm7, 11 ; sign extended shift right by 3 - packsswb mm0, mm7 ; Filter2 >>=3; - - ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 - psubsb mm3, mm0 ; qs0 =qs0 - filter1 - paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 - - ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 - ; vp8_filter &= ~hev; - ; Filter2 = vp8_filter; - pandn mm4, mm1 ; vp8_filter&=~hev - - - ; mm3=qs0, mm4=filter2, mm6=ps0 - - ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); - ; s = vp8_signed_char_clamp(qs0 - u); - ; *oq0 = s^0x80; - ; s = vp8_signed_char_clamp(ps0 + u); - ; *op0 = s^0x80; - pxor mm0, mm0 - - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s27)] - pmulhw mm2, [GLOBAL(s27)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 
- psraw mm2, 7 - packsswb mm1, mm2 - - psubsb mm3, mm1 - paddsb mm6, mm1 - - pxor mm3, [GLOBAL(t80)] - pxor mm6, [GLOBAL(t80)] - movq [rdx+24], mm6 - movq [rdx+32], mm3 - - ; roughly 2/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); - ; s = vp8_signed_char_clamp(qs1 - u); - ; *oq1 = s^0x80; - ; s = vp8_signed_char_clamp(ps1 + u); - ; *op1 = s^0x80; - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s18)] - pmulhw mm2, [GLOBAL(s18)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - movq mm3, [rdx + 40] - movq mm6, [rdx + 16] ; p1 - pxor mm3, [GLOBAL(t80)] - pxor mm6, [GLOBAL(t80)] - - paddsb mm6, mm1 - psubsb mm3, mm1 - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - movq [rdx + 40], mm3 - movq [rdx + 16], mm6 - - ; roughly 1/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); - ; s = vp8_signed_char_clamp(qs2 - u); - ; *oq2 = s^0x80; - ; s = vp8_signed_char_clamp(ps2 + u); - ; *op2 = s^0x80; - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s9)] - pmulhw mm2, [GLOBAL(s9)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - movq mm6, [rdx+ 8] - movq mm3, [rdx+48] - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - - paddsb mm6, mm1 - psubsb mm3, mm1 - - pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01 - pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06 - - ; transpose and write back - movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00 - movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00 - - punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00 - punpckhbw mm1, mm6 ; mm3 = 71 70 61 60 51 50 41 40 - - movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02 - movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02 - - punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02 - punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42 - - movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00 - punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00 - - punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20 - movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40 - - punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40 - punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60 - - movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 - punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04 - - movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06 - punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06 - - movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04 - punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04 - - punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24 - movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00 - - punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00 - punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10 - - movq [rsi+rax*4], mm0 ; write out - movq [rdi+rax*4], mm6 ; write out - - movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20 - punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 20 20 - - punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30 - movq [rsi+rax*2], mm0 ; write out - - movq [rdi+rax*2], mm5 ; write out - movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 - - punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44 - punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46 - - movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44 - punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44 - - punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64 - movq mm0, mm1 ; mm0= 53 52 51 50 43 42 
41 40 - - movq mm3, mm4 ; mm4 = 73 72 71 70 63 62 61 60 - punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40 - - punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50 - movq [rsi], mm0 ; write out - - movq [rdi], mm1 ; write out - neg rax - - punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60 - punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 60 - - movq [rsi+rax*2], mm3 - movq [rdi+rax*2], mm4 - - lea rsi, [rsi+rax*8] - dec rcx - - jnz .next8_mbv - - add rsp, 96 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_loop_filter_simple_horizontal_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit -;) -global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE -sym(vp8_loop_filter_simple_horizontal_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - mov rcx, 2 ; count -.nexts8_h: - mov rdx, arg(2) ;blimit ; get blimit - movq mm3, [rdx] ; - - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - neg rax - - ; calculate mask - movq mm1, [rsi+2*rax] ; p1 - movq mm0, [rdi] ; q1 - movq mm2, mm1 - movq mm7, mm0 - movq mm4, mm0 - psubusb mm0, mm1 ; q1-=p1 - psubusb mm1, mm4 ; p1-=q1 - por mm1, mm0 ; abs(p1-q1) - pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm1, 1 ; abs(p1-q1)/2 - - movq mm5, [rsi+rax] ; p0 - movq mm4, [rsi] ; q0 - movq mm0, mm4 ; q0 - movq mm6, mm5 ; p0 - psubusb mm5, mm4 ; p0-=q0 - psubusb mm4, mm6 ; q0-=p0 - por mm5, mm4 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor mm3, mm3 - pcmpeqb mm5, mm3 - - ; start work on filters - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0) - paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0) - paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0) - pand mm5, mm2 ; mask filter values we don't care about - - ; do + 4 side - paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - movq mm1, mm5 ; get a copy of filters - psraw mm1, 11 ; arithmetic shift right 11 - psllw mm1, 8 ; shift left 8 to put it back - - por mm0, mm1 ; put the two together to get result - - psubsb mm3, mm0 ; q0-= q0 add - pxor mm3, [GLOBAL(t80)] ; unoffset - movq [rsi], mm3 ; write back - - - ; now do +3 side - psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - psraw mm5, 11 ; arithmetic shift right 11 - psllw mm5, 8 ; shift left 8 to put it back - por mm0, mm5 ; put the two together to get result - - - paddsb mm6, mm0 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - movq [rsi+rax], mm6 ; write back - - add rsi,8 - neg rax - dec rcx - jnz .nexts8_h - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_loop_filter_simple_vertical_edge_mmx -;( -; unsigned char 
*src_ptr, -; int src_pixel_step, -; const char *blimit -;) -global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE -sym(vp8_loop_filter_simple_vertical_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - lea rsi, [rsi + rax*4- 2]; ; - mov rcx, 2 ; count -.nexts8_v: - - lea rdi, [rsi + rax]; - movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70 - - movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60 - punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 - - movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50 - movd mm4, [rsi] ; xx xx xx xx 43 42 41 40 - - punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 - movq mm5, mm4 ; 53 43 52 42 51 41 50 40 - - punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40 - punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42 - - neg rax - - movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30 - movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20 - - punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20 - movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10 - - movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00 - punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00 - - movq mm2, mm0 ; 13 03 12 02 11 01 10 00 - punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00 - - punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02 - movq mm1, mm0 ; 13 03 12 02 11 01 10 00 - - punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1 - movq mm3, mm2 ; 33 23 13 03 32 22 12 02 - - punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0 - punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0 - - punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1 - - - ; calculate mask - movq mm6, mm0 ; p1 - movq mm7, mm3 ; q1 - psubusb mm7, mm6 ; q1-=p1 - psubusb mm6, mm3 ; p1-=q1 - por mm6, mm7 ; abs(p1-q1) - pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm6, 1 ; abs(p1-q1)/2 - - movq mm5, mm1 ; p0 - movq mm4, mm2 ; q0 - - psubusb mm5, mm2 ; p0-=q0 - psubusb mm4, mm1 ; q0-=p0 - - por mm5, mm4 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; get blimit - movq mm7, [rdx] - - psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor mm7, mm7 - pcmpeqb mm5, mm7 ; mm5 = mask - - ; start work on filters - movq t0, mm0 - movq t1, mm3 - - pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values - - psubsb mm0, mm3 ; p1 - q1 - movq mm6, mm1 ; p0 - - movq mm7, mm2 ; q0 - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - - pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm7 ; offseted ; q0 - - psubsb mm7, mm6 ; q0 - p0 - paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0) - - paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0) - paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0) - - pand mm5, mm0 ; mask filter values we don't care about - - paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - - movq mm7, mm5 ; get a copy of filters - psraw mm7, 11 ; arithmetic shift right 11 - psllw mm7, 8 ; shift left 8 to put it back - - por mm0, mm7 ; put the two together to get result - - psubsb mm3, mm0 ; q0-= q0sz add - pxor mm3, [GLOBAL(t80)] ; 
unoffset - - ; now do +3 side - psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - - psraw mm5, 11 ; arithmetic shift right 11 - psllw mm5, 8 ; shift left 8 to put it back - por mm0, mm5 ; put the two together to get result - - paddsb mm6, mm0 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - - - movq mm0, t0 - movq mm4, t1 - - ; mm0 = 70 60 50 40 30 20 10 00 - ; mm6 = 71 61 51 41 31 21 11 01 - ; mm3 = 72 62 52 42 32 22 12 02 - ; mm4 = 73 63 53 43 33 23 13 03 - ; transpose back to write out - - movq mm1, mm0 ; - punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00 - - punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40 - movq mm2, mm3 ; - - punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02 - movq mm5, mm1 ; 71 70 61 60 51 50 41 40 - - punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42 - movq mm6, mm0 ; 31 30 21 20 11 10 01 00 - - punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00 - punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20 - - movd [rsi+rax*4], mm0 ; write 03 02 01 00 - punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40 - - psrlq mm0, 32 ; xx xx xx xx 13 12 11 10 - punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60 - - movd [rdi+rax*4], mm0 ; write 13 12 11 10 - movd [rsi+rax*2], mm6 ; write 23 22 21 20 - - psrlq mm6, 32 ; 33 32 31 30 - movd [rsi], mm1 ; write 43 42 41 40 - - movd [rsi + rax], mm6 ; write 33 32 31 30 - neg rax - - movd [rsi + rax*2], mm5 ; write 63 62 61 60 - psrlq mm1, 32 ; 53 52 51 50 - - movd [rdi], mm1 ; write out 53 52 51 50 - psrlq mm5, 32 ; 73 72 71 70 - - movd [rdi + rax*2], mm5 ; write 73 72 71 70 - - lea rsi, [rsi+rax*8] ; next 8 - - dec rcx - jnz .nexts8_v - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - -;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr, -; int y_stride, -; loop_filter_info *lfi) -;{ -; -; -; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -;} - -SECTION_RODATA -align 16 -tfe: - times 8 db 0xfe -align 16 -t80: - times 8 db 0x80 -align 16 -t1s: - times 8 db 0x01 -align 16 -t3: - times 8 db 0x03 -align 16 -t4: - times 8 db 0x04 -align 16 -ones: - times 4 dw 0x0001 -align 16 -s27: - times 4 dw 0x1b00 -align 16 -s18: - times 4 dw 0x1200 -align 16 -s9: - times 4 dw 0x0900 -align 16 -s63: - times 4 dw 0x003f diff --git a/libs/libvpx/vp8/decoder/decodeframe.c b/libs/libvpx/vp8/decoder/decodeframe.c index 0aec2a01b0..077bd3da26 100644 --- a/libs/libvpx/vp8/decoder/decodeframe.c +++ b/libs/libvpx/vp8/decoder/decodeframe.c @@ -930,7 +930,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) { /* When error concealment is enabled we should only check the sync * code if we have enough bits available */ - if (!pbi->ec_active || data + 3 < data_end) { + if (data + 3 < data_end) { if (clear[0] != 0x9d || clear[1] != 0x01 || clear[2] != 0x2a) { vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid frame sync code"); @@ -941,13 +941,19 @@ int vp8_decode_frame(VP8D_COMP *pbi) { * if we have enough data. Otherwise we will end up with the wrong * size. 
*/ - if (!pbi->ec_active || data + 6 < data_end) { + if (data + 6 < data_end) { pc->Width = (clear[3] | (clear[4] << 8)) & 0x3fff; pc->horiz_scale = clear[4] >> 6; pc->Height = (clear[5] | (clear[6] << 8)) & 0x3fff; pc->vert_scale = clear[6] >> 6; + data += 7; + } else if (!pbi->ec_active) { + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated key frame header"); + } else { + /* Error concealment is active, clear the frame. */ + data = data_end; } - data += 7; } else { memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG)); memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG)); @@ -1199,7 +1205,8 @@ int vp8_decode_frame(VP8D_COMP *pbi) { pbi->frame_corrupt_residual = 0; #if CONFIG_MULTITHREAD - if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) && + pc->multi_token_partition != ONE_PARTITION) { unsigned int thread; vp8mt_decode_mb_rows(pbi, xd); vp8_yv12_extend_frame_borders(yv12_fb_new); diff --git a/libs/libvpx/vp8/decoder/decodemv.c b/libs/libvpx/vp8/decoder/decodemv.c index b946ab73d0..8e9600c6da 100644 --- a/libs/libvpx/vp8/decoder/decodemv.c +++ b/libs/libvpx/vp8/decoder/decodemv.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "decodemv.h" #include "treereader.h" #include "vp8/common/entropymv.h" #include "vp8/common/entropymode.h" @@ -64,8 +65,7 @@ static int read_mvcomponent(vp8_reader *r, const MV_CONTEXT *mvc) { const vp8_prob *const p = (const vp8_prob *)mvc; int x = 0; - if (vp8_read(r, p[mvpis_short])) /* Large */ - { + if (vp8_read(r, p[mvpis_short])) { /* Large */ int i = 0; do { @@ -284,8 +284,7 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi) { vp8_reader *const bc = &pbi->mbc[8]; mbmi->ref_frame = (MV_REFERENCE_FRAME)vp8_read(bc, pbi->prob_intra); - if (mbmi->ref_frame) /* inter MB */ - { + if (mbmi->ref_frame) { /* inter MB */ enum { CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV }; int cnt[4]; int *cntx = cnt; diff --git a/libs/libvpx/vp8/decoder/onyxd_if.c b/libs/libvpx/vp8/decoder/onyxd_if.c index c6b5660364..f516eb0c78 100644 --- a/libs/libvpx/vp8/decoder/onyxd_if.c +++ b/libs/libvpx/vp8/decoder/onyxd_if.c @@ -29,6 +29,7 @@ #include "./vpx_scale_rtcd.h" #include "vpx_scale/vpx_scale.h" #include "vp8/common/systemdependent.h" +#include "vpx_ports/system_state.h" #include "vpx_ports/vpx_once.h" #include "vpx_ports/vpx_timer.h" #include "detokenize.h" @@ -40,7 +41,6 @@ #endif extern void vp8_init_loop_filter(VP8_COMMON *cm); -extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi); static int get_free_fb(VP8_COMMON *cm); static void ref_cnt_fb(int *buf, int *idx, int new_idx); @@ -352,7 +352,7 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, goto decode_exit; } - vp8_clear_system_state(); + vpx_clear_system_state(); if (cm->show_frame) { cm->current_video_frame++; @@ -383,7 +383,7 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, decode_exit: pbi->common.error.setjmp = 0; - vp8_clear_system_state(); + vpx_clear_system_state(); return retcode; } int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, @@ -416,7 +416,7 @@ int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, } #endif /*!CONFIG_POSTPROC*/ - vp8_clear_system_state(); + vpx_clear_system_state(); return ret; } @@ -438,47 +438,39 @@ int vp8dx_references_buffer(VP8_COMMON *oci, int ref_frame) { } int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf) { - if 
(!fb->use_frame_threads) { - /* decoder instance for single thread mode */ - fb->pbi[0] = create_decompressor(oxcf); - if (!fb->pbi[0]) return VPX_CODEC_ERROR; + /* decoder instance for single thread mode */ + fb->pbi[0] = create_decompressor(oxcf); + if (!fb->pbi[0]) return VPX_CODEC_ERROR; #if CONFIG_MULTITHREAD - if (setjmp(fb->pbi[0]->common.error.jmp)) { - vp8_remove_decoder_instances(fb); - memset(fb->pbi, 0, sizeof(fb->pbi) / sizeof(fb->pbi[0])); - vp8_clear_system_state(); - return VPX_CODEC_ERROR; - } - - fb->pbi[0]->common.error.setjmp = 1; - fb->pbi[0]->max_threads = oxcf->max_threads; - vp8_decoder_create_threads(fb->pbi[0]); - fb->pbi[0]->common.error.setjmp = 0; -#endif - } else { - /* TODO : create frame threads and decoder instances for each - * thread here */ + if (setjmp(fb->pbi[0]->common.error.jmp)) { + vp8_remove_decoder_instances(fb); + memset(fb->pbi, 0, sizeof(fb->pbi)); + vpx_clear_system_state(); + return VPX_CODEC_ERROR; } + fb->pbi[0]->common.error.setjmp = 1; + fb->pbi[0]->max_threads = oxcf->max_threads; + vp8_decoder_create_threads(fb->pbi[0]); + fb->pbi[0]->common.error.setjmp = 0; +#endif return VPX_CODEC_OK; } int vp8_remove_decoder_instances(struct frame_buffers *fb) { - if (!fb->use_frame_threads) { - VP8D_COMP *pbi = fb->pbi[0]; + VP8D_COMP *pbi = fb->pbi[0]; - if (!pbi) return VPX_CODEC_ERROR; + if (!pbi) return VPX_CODEC_ERROR; #if CONFIG_MULTITHREAD - vp8_decoder_remove_threads(pbi); + vp8_decoder_remove_threads(pbi); #endif - /* decoder instance for single thread mode */ - remove_decompressor(pbi); - } else { - /* TODO : remove frame threads and decoder instances for each - * thread here */ - } - + /* decoder instance for single thread mode */ + remove_decompressor(pbi); return VPX_CODEC_OK; } + +int vp8dx_get_quantizer(const VP8D_COMP *cpi) { + return cpi->common.base_qindex; +} diff --git a/libs/libvpx/vp8/decoder/onyxd_int.h b/libs/libvpx/vp8/decoder/onyxd_int.h index 3a8a431465..5ecacdbb97 100644 --- a/libs/libvpx/vp8/decoder/onyxd_int.h +++ b/libs/libvpx/vp8/decoder/onyxd_int.h @@ -47,9 +47,6 @@ struct frame_buffers { * this struct will be populated with frame buffer management * info in future commits. */ - /* enable/disable frame-based threading */ - int use_frame_threads; - /* decoder instances */ struct VP8D_COMP *pbi[MAX_FB_MT_DEC]; }; @@ -70,7 +67,8 @@ typedef struct VP8D_COMP { #if CONFIG_MULTITHREAD /* variable for threading */ - volatile int b_multithreaded_rd; + + vpx_atomic_int b_multithreaded_rd; int max_threads; int current_mb_col_main; unsigned int decoding_thread_count; @@ -78,7 +76,8 @@ typedef struct VP8D_COMP { int mt_baseline_filter_level[MAX_MB_SEGMENTS]; int sync_range; - int *mt_current_mb_col; /* Each row remembers its already decoded column. */ + /* Each row remembers its already decoded column. 
*/ + vpx_atomic_int *mt_current_mb_col; unsigned char **mt_yabove_row; /* mb_rows x width */ unsigned char **mt_uabove_row; @@ -119,6 +118,8 @@ typedef struct VP8D_COMP { void *decrypt_state; } VP8D_COMP; +void vp8cx_init_de_quantizer(VP8D_COMP *pbi); +void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd); int vp8_decode_frame(VP8D_COMP *cpi); int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf); diff --git a/libs/libvpx/vp8/decoder/threading.c b/libs/libvpx/vp8/decoder/threading.c index 88f7723e33..d0213f75c1 100644 --- a/libs/libvpx/vp8/decoder/threading.c +++ b/libs/libvpx/vp8/decoder/threading.c @@ -20,6 +20,7 @@ #include "vp8/common/loopfilter.h" #include "vp8/common/extend.h" #include "vpx_ports/vpx_timer.h" +#include "decoderthreading.h" #include "detokenize.h" #include "vp8/common/reconintra4x4.h" #include "vp8/common/reconinter.h" @@ -36,8 +37,6 @@ memset((p), 0, (n) * sizeof(*(p))); \ } while (0) -void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd); - static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count) { VP8_COMMON *const pc = &pbi->common; @@ -50,9 +49,6 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, mbd->subpixel_predict8x8 = xd->subpixel_predict8x8; mbd->subpixel_predict16x16 = xd->subpixel_predict16x16; - mbd->mode_info_context = pc->mi + pc->mode_info_stride * (i + 1); - mbd->mode_info_stride = pc->mode_info_stride; - mbd->frame_type = pc->frame_type; mbd->pre = xd->pre; mbd->dst = xd->dst; @@ -83,7 +79,8 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, if (pc->full_pixel) mbd->fullpixel_mask = 0xfffffff8; } - for (i = 0; i < pc->mb_rows; ++i) pbi->mt_current_mb_col[i] = -1; + for (i = 0; i < pc->mb_rows; ++i) + vpx_atomic_store_release(&pbi->mt_current_mb_col[i], -1); } static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, @@ -251,12 +248,13 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row) { - volatile const int *last_row_current_mb_col; - volatile int *current_mb_col; + const vpx_atomic_int *last_row_current_mb_col; + vpx_atomic_int *current_mb_col; int mb_row; VP8_COMMON *pc = &pbi->common; const int nsync = pbi->sync_range; - const int first_row_no_sync_above = pc->mb_cols + nsync; + const vpx_atomic_int first_row_no_sync_above = + VPX_ATOMIC_INIT(pc->mb_cols + nsync); int num_part = 1 << pbi->common.multi_token_partition; int last_mb_row = start_mb_row; @@ -289,6 +287,9 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->up_available = (start_mb_row != 0); + xd->mode_info_context = pc->mi + pc->mode_info_stride * start_mb_row; + xd->mode_info_stride = pc->mode_info_stride; + for (mb_row = start_mb_row; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1)) { int recon_yoffset, recon_uvoffset; @@ -318,7 +319,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->left_available = 0; - xd->mb_to_top_edge = -((mb_row * 16)) << 3; + xd->mb_to_top_edge = -((mb_row * 16) << 3); xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3; if (pbi->common.filter_level) { @@ -355,14 +356,13 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->dst.uv_stride); } - for (mb_col = 0; mb_col < pc->mb_cols; mb_col++) { - *current_mb_col = mb_col - 1; + for (mb_col = 0; mb_col < pc->mb_cols; ++mb_col) { + if (((mb_col - 1) % nsync) == 0) { + 
vpx_atomic_store_release(current_mb_col, mb_col - 1); + } - if ((mb_col & (nsync - 1)) == 0) { - while (mb_col > (*last_row_current_mb_col - nsync)) { - x86_pause_hint(); - thread_sleep(1); - } + if (mb_row && !(mb_col & (nsync - 1))) { + vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync); } /* Distance of MB to the various image edges. @@ -548,7 +548,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, } /* last MB of row is ready just after extension is done */ - *current_mb_col = mb_col + nsync; + vpx_atomic_store_release(current_mb_col, mb_col + nsync); ++xd->mode_info_context; /* skip prediction column */ xd->up_available = 1; @@ -568,10 +568,10 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) { ENTROPY_CONTEXT_PLANES mb_row_left_context; while (1) { - if (pbi->b_multithreaded_rd == 0) break; + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) break; if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) { - if (pbi->b_multithreaded_rd == 0) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) { break; } else { MACROBLOCKD *xd = &mbrd->mbd; @@ -589,7 +589,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { int core_count = 0; unsigned int ithread; - pbi->b_multithreaded_rd = 0; + vpx_atomic_init(&pbi->b_multithreaded_rd, 0); pbi->allocated_decoding_thread_count = 0; /* limit decoding threads to the max number of token partitions */ @@ -601,7 +601,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { } if (core_count > 1) { - pbi->b_multithreaded_rd = 1; + vpx_atomic_init(&pbi->b_multithreaded_rd, 1); pbi->decoding_thread_count = core_count - 1; CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count); @@ -712,7 +712,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { int i; int uv_width; - if (pbi->b_multithreaded_rd) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows); /* our internal buffers are always multiples of 16 */ @@ -730,27 +730,33 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { uv_width = width >> 1; - /* Allocate an int for each mb row. */ - CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows); + /* Allocate a vpx_atomic_int for each mb row. */ + CHECK_MEM_ERROR(pbi->mt_current_mb_col, + vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows)); + for (i = 0; i < pc->mb_rows; ++i) + vpx_atomic_init(&pbi->mt_current_mb_col[i], 0); /* Allocate memory for above_row buffers. */ CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_yabove_row[i], - vpx_memalign(16, sizeof(unsigned char) * - (width + (VP8BORDERINPIXELS << 1)))); + CHECK_MEM_ERROR( + pbi->mt_yabove_row[i], + vpx_memalign( + 16, sizeof(unsigned char) * (width + (VP8BORDERINPIXELS << 1)))); CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_uabove_row[i], - vpx_memalign(16, sizeof(unsigned char) * - (uv_width + VP8BORDERINPIXELS))); + CHECK_MEM_ERROR( + pbi->mt_uabove_row[i], + vpx_memalign(16, + sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_vabove_row[i], - vpx_memalign(16, sizeof(unsigned char) * - (uv_width + VP8BORDERINPIXELS))); + CHECK_MEM_ERROR( + pbi->mt_vabove_row[i], + vpx_memalign(16, + sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); /* Allocate memory for left_col buffers. 
*/ CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows); @@ -772,9 +778,9 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { void vp8_decoder_remove_threads(VP8D_COMP *pbi) { /* shutdown MB Decoding thread; */ - if (pbi->b_multithreaded_rd) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { int i; - pbi->b_multithreaded_rd = 0; + vpx_atomic_store_release(&pbi->b_multithreaded_rd, 0); /* allow all threads to exit */ for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) { diff --git a/libs/libvpx/vp8/encoder/bitstream.c b/libs/libvpx/vp8/encoder/bitstream.c index 1b100cfe8b..8cacb64505 100644 --- a/libs/libvpx/vp8/encoder/bitstream.c +++ b/libs/libvpx/vp8/encoder/bitstream.c @@ -19,6 +19,7 @@ #include #include "vpx/vpx_encoder.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/system_state.h" #include "bitstream.h" #include "defaultcoefcounts.h" @@ -499,8 +500,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { } write_uv_mode(w, mi->uv_mode, pc->fc.uv_mode_prob); - } else /* inter coded */ - { + } else { /* inter coded */ int_mv best_mv; vp8_prob mv_ref_p[VP8_MVREFS - 1]; @@ -843,7 +843,7 @@ int vp8_estimate_entropy_savings(VP8_COMP *cpi) { int new_intra, new_last, new_garf, oldtotal, newtotal; int ref_frame_cost[MAX_REF_FRAMES]; - vp8_clear_system_state(); + vpx_clear_system_state(); if (cpi->common.frame_type != KEY_FRAME) { if (!(new_intra = rf_intra * 255 / (rf_intra + rf_inter))) new_intra = 1; @@ -908,7 +908,7 @@ void vp8_update_coef_probs(VP8_COMP *cpi) { #endif int savings = 0; - vp8_clear_system_state(); + vpx_clear_system_state(); do { int j = 0; @@ -1295,7 +1295,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, #endif - vp8_clear_system_state(); + vpx_clear_system_state(); #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING pack_coef_probs(cpi); @@ -1415,7 +1415,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, vp8_start_encode(&cpi->bc[1], cx_data, cx_data_end); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { pack_mb_row_tokens(cpi, &cpi->bc[1]); } else { vp8_pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count); diff --git a/libs/libvpx/vp8/encoder/bitstream.h b/libs/libvpx/vp8/encoder/bitstream.h index 2b196dcd27..ed45bff9e2 100644 --- a/libs/libvpx/vp8/encoder/bitstream.h +++ b/libs/libvpx/vp8/encoder/bitstream.h @@ -15,7 +15,15 @@ extern "C" { #endif +#include "vp8/encoder/treewriter.h" +#include "vp8/encoder/tokenize.h" + void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount); +void vp8_convert_rfct_to_prob(struct VP8_COMP *const cpi); +void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra, + int prob_last, int prob_garf); +int vp8_estimate_entropy_savings(struct VP8_COMP *cpi); +void vp8_update_coef_probs(struct VP8_COMP *cpi); #ifdef __cplusplus } // extern "C" diff --git a/libs/libvpx/vp8/encoder/encodeframe.c b/libs/libvpx/vp8/encoder/encodeframe.c index 600ba4061b..9bb0df72d5 100644 --- a/libs/libvpx/vp8/encoder/encodeframe.c +++ b/libs/libvpx/vp8/encoder/encodeframe.c @@ -11,8 +11,12 @@ #include "vpx_config.h" #include "vp8_rtcd.h" #include "./vpx_dsp_rtcd.h" +#include "bitstream.h" #include "encodemb.h" #include "encodemv.h" +#if CONFIG_MULTITHREAD +#include "ethreading.h" +#endif #include "vp8/common/common.h" #include "onyx_int.h" #include "vp8/common/extend.h" @@ -35,13 +39,6 @@ #include "encodeframe.h" extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); -extern void 
vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra, - int prob_last, int prob_garf); -extern void vp8_convert_rfct_to_prob(VP8_COMP *const cpi); -extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex); -extern void vp8_auto_select_speed(VP8_COMP *cpi); -extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, - MB_ROW_COMP *mbr_ei, int count); static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x); #ifdef MODE_STATS @@ -344,11 +341,11 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, #if CONFIG_MULTITHREAD const int nsync = cpi->mt_sync_range; - const int rightmost_col = cm->mb_cols + nsync; - volatile const int *last_row_current_mb_col; - volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; + vpx_atomic_int rightmost_col = VPX_ATOMIC_INIT(cm->mb_cols + nsync); + const vpx_atomic_int *last_row_current_mb_col; + vpx_atomic_int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; - if ((cpi->b_multi_threaded != 0) && (mb_row != 0)) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0 && mb_row != 0) { last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1]; } else { last_row_current_mb_col = &rightmost_col; @@ -418,14 +415,13 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded != 0) { - *current_mb_col = mb_col - 1; /* set previous MB done */ + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) { + if (((mb_col - 1) % nsync) == 0) { + vpx_atomic_store_release(current_mb_col, mb_col - 1); + } - if ((mb_col & (nsync - 1)) == 0) { - while (mb_col > (*last_row_current_mb_col - nsync)) { - x86_pause_hint(); - thread_sleep(1); - } + if (mb_row && !(mb_col & (nsync - 1))) { + vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync); } } #endif @@ -565,7 +561,10 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded != 0) *current_mb_col = rightmost_col; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) { + vpx_atomic_store_release(current_mb_col, + vpx_atomic_load_acquire(&rightmost_col)); + } #endif /* this is to account for the border */ @@ -749,13 +748,14 @@ void vp8_encode_frame(VP8_COMP *cpi) { vpx_usec_timer_start(&emr_timer); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { int i; vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, cpi->encoding_thread_count); - for (i = 0; i < cm->mb_rows; ++i) cpi->mt_current_mb_col[i] = -1; + for (i = 0; i < cm->mb_rows; ++i) + vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1); for (i = 0; i < cpi->encoding_thread_count; ++i) { sem_post(&cpi->h_event_start_encoding[i]); diff --git a/libs/libvpx/vp8/encoder/encodeframe.h b/libs/libvpx/vp8/encoder/encodeframe.h index c1d8634927..5274aba412 100644 --- a/libs/libvpx/vp8/encoder/encodeframe.h +++ b/libs/libvpx/vp8/encoder/encodeframe.h @@ -10,24 +10,29 @@ #ifndef VP8_ENCODER_ENCODEFRAME_H_ #define VP8_ENCODER_ENCODEFRAME_H_ +#include "vp8/encoder/tokenize.h" + #ifdef __cplusplus extern "C" { #endif -extern void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x); -extern void vp8_build_block_offsets(MACROBLOCK *x); +struct VP8_COMP; +struct macroblock; -extern void vp8_setup_block_ptrs(MACROBLOCK *x); +void vp8_activity_masking(struct VP8_COMP *cpi, MACROBLOCK *x); -extern void 
vp8_encode_frame(VP8_COMP *cpi); +void vp8_build_block_offsets(struct macroblock *x); -extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, - TOKENEXTRA **t, int recon_yoffset, - int recon_uvoffset, int mb_row, - int mb_col); +void vp8_setup_block_ptrs(struct macroblock *x); -extern int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x, - TOKENEXTRA **t); +void vp8_encode_frame(struct VP8_COMP *cpi); + +int vp8cx_encode_inter_macroblock(struct VP8_COMP *cpi, struct macroblock *x, + TOKENEXTRA **t, int recon_yoffset, + int recon_uvoffset, int mb_row, int mb_col); + +int vp8cx_encode_intra_macroblock(struct VP8_COMP *cpi, struct macroblock *x, + TOKENEXTRA **t); #ifdef __cplusplus } // extern "C" #endif diff --git a/libs/libvpx/vp8/encoder/encodemv.c b/libs/libvpx/vp8/encoder/encodemv.c index cfc7af663f..ea93ccd710 100644 --- a/libs/libvpx/vp8/encoder/encodemv.c +++ b/libs/libvpx/vp8/encoder/encodemv.c @@ -12,6 +12,7 @@ #include "encodemv.h" #include "vp8/common/entropymode.h" #include "vp8/common/systemdependent.h" +#include "vpx_ports/system_state.h" #include @@ -24,14 +25,12 @@ static void encode_mvcomponent(vp8_writer *const w, const int v, const vp8_prob *p = mvc->prob; const int x = v < 0 ? -v : v; - if (x < mvnum_short) /* Small */ - { + if (x < mvnum_short) { /* Small */ vp8_write(w, 0, p[mvpis_short]); vp8_treed_write(w, vp8_small_mvtree, p + MVPshort, x, 3); if (!x) return; /* no sign bit */ - } else /* Large */ - { + } else { /* Large */ int i = 0; vp8_write(w, 1, p[mvpis_short]); @@ -126,7 +125,7 @@ void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, unsigned int cost0 = 0; unsigned int cost1 = 0; - vp8_clear_system_state(); + vpx_clear_system_state(); i = 1; diff --git a/libs/libvpx/vp8/encoder/ethreading.c b/libs/libvpx/vp8/encoder/ethreading.c index 68ca0df08d..55a1528b14 100644 --- a/libs/libvpx/vp8/encoder/ethreading.c +++ b/libs/libvpx/vp8/encoder/ethreading.c @@ -14,6 +14,7 @@ #include "vp8/common/extend.h" #include "bitstream.h" #include "encodeframe.h" +#include "ethreading.h" #if CONFIG_MULTITHREAD @@ -25,11 +26,11 @@ static THREAD_FUNCTION thread_loopfilter(void *p_data) { VP8_COMMON *cm = &cpi->common; while (1) { - if (cpi->b_multi_threaded == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; if (sem_wait(&cpi->h_event_start_lpf) == 0) { /* we're shutting down */ - if (cpi->b_multi_threaded == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; vp8_loopfilter_frame(cpi, cm); @@ -47,7 +48,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { ENTROPY_CONTEXT_PLANES mb_row_left_context; while (1) { - if (cpi->b_multi_threaded == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) { const int nsync = cpi->mt_sync_range; @@ -65,7 +66,10 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { int *totalrate = &mbri->totalrate; /* we're shutting down */ - if (cpi->b_multi_threaded == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; + + xd->mode_info_context = cm->mi + cm->mode_info_stride * (ithread + 1); + xd->mode_info_stride = cm->mode_info_stride; for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1)) { @@ -76,8 +80,8 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride; int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride; int map_index = 
(mb_row * cm->mb_cols); - volatile const int *last_row_current_mb_col; - volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; + const vpx_atomic_int *last_row_current_mb_col; + vpx_atomic_int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) vp8_writer *w = &cpi->bc[1 + (mb_row % num_part)]; @@ -103,13 +107,12 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { /* for each macroblock col in image */ for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { - *current_mb_col = mb_col - 1; + if (((mb_col - 1) % nsync) == 0) { + vpx_atomic_store_release(current_mb_col, mb_col - 1); + } - if ((mb_col & (nsync - 1)) == 0) { - while (mb_col > (*last_row_current_mb_col - nsync)) { - x86_pause_hint(); - thread_sleep(1); - } + if (mb_row && !(mb_col & (nsync - 1))) { + vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync); } #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING @@ -281,7 +284,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { vp8_extend_mb_row(&cm->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); - *current_mb_col = mb_col + nsync; + vpx_atomic_store_release(current_mb_col, mb_col + nsync); /* this is to account for the border */ xd->mode_info_context++; @@ -450,9 +453,6 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, mb->partition_info = x->pi + x->e_mbd.mode_info_stride * (i + 1); - mbd->mode_info_context = cm->mi + x->e_mbd.mode_info_stride * (i + 1); - mbd->mode_info_stride = cm->mode_info_stride; - mbd->frame_type = cm->frame_type; mb->src = *cpi->Source; @@ -488,7 +488,7 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, int vp8cx_create_encoder_threads(VP8_COMP *cpi) { const VP8_COMMON *cm = &cpi->common; - cpi->b_multi_threaded = 0; + vpx_atomic_init(&cpi->b_multi_threaded, 0); cpi->encoding_thread_count = 0; cpi->b_lpf_running = 0; @@ -522,7 +522,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { CHECK_MEM_ERROR(cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count)); - cpi->b_multi_threaded = 1; + vpx_atomic_store_release(&cpi->b_multi_threaded, 1); cpi->encoding_thread_count = th_count; /* @@ -551,7 +551,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if (rc) { /* shutdown other threads */ - cpi->b_multi_threaded = 0; + vpx_atomic_store_release(&cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { pthread_join(cpi->h_encoding_thread[ithread], 0); sem_destroy(&cpi->h_event_start_encoding[ithread]); @@ -579,7 +579,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if (rc) { /* shutdown other threads */ - cpi->b_multi_threaded = 0; + vpx_atomic_store_release(&cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { sem_post(&cpi->h_event_start_encoding[ithread]); sem_post(&cpi->h_event_end_encoding[ithread]); @@ -605,9 +605,9 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { } void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { /* shutdown other threads */ - cpi->b_multi_threaded = 0; + vpx_atomic_store_release(&cpi->b_multi_threaded, 0); { int i; diff --git a/libs/libvpx/vp8/encoder/ethreading.h b/libs/libvpx/vp8/encoder/ethreading.h new file mode 100644 index 0000000000..95bf73d182 --- /dev/null +++ b/libs/libvpx/vp8/encoder/ethreading.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_ENCODER_ETHREADING_H_ +#define VP8_ENCODER_ETHREADING_H_ + +#include "vp8/encoder/onyx_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; +struct macroblock; + +void vp8cx_init_mbrthread_data(struct VP8_COMP *cpi, struct macroblock *x, + MB_ROW_COMP *mbr_ei, int count); +int vp8cx_create_encoder_threads(struct VP8_COMP *cpi); +void vp8cx_remove_encoder_threads(struct VP8_COMP *cpi); + +#ifdef __cplusplus +} +#endif + +#endif // VP8_ENCODER_ETHREADING_H_ diff --git a/libs/libvpx/vp8/encoder/firstpass.c b/libs/libvpx/vp8/encoder/firstpass.c index cd34e33fb8..70f9243410 100644 --- a/libs/libvpx/vp8/encoder/firstpass.c +++ b/libs/libvpx/vp8/encoder/firstpass.c @@ -26,6 +26,7 @@ #include "vpx_scale/vpx_scale.h" #include "encodemb.h" #include "vp8/common/extend.h" +#include "vpx_ports/system_state.h" #include "vpx_mem/vpx_mem.h" #include "vp8/common/swapyv12buffer.h" #include "rdopt.h" @@ -499,7 +500,7 @@ void vp8_first_pass(VP8_COMP *cpi) { zero_ref_mv.as_int = 0; - vp8_clear_system_state(); + vpx_clear_system_state(); x->src = *cpi->Source; xd->pre = *lst_yv12; @@ -741,10 +742,10 @@ void vp8_first_pass(VP8_COMP *cpi) { /* extend the recon for intra prediction */ vp8_extend_mb_row(new_yv12, xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); - vp8_clear_system_state(); + vpx_clear_system_state(); } - vp8_clear_system_state(); + vpx_clear_system_state(); { double weight = 0.0; @@ -1184,9 +1185,9 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, current_spend_ratio = (double)cpi->long_rolling_actual_bits / (double)cpi->long_rolling_target_bits; current_spend_ratio = - (current_spend_ratio > 10.0) ? 10.0 : (current_spend_ratio < 0.1) - ? 0.1 - : current_spend_ratio; + (current_spend_ratio > 10.0) + ? 10.0 + : (current_spend_ratio < 0.1) ? 0.1 : current_spend_ratio; } /* Calculate a correction factor based on the quality of prediction in @@ -1272,8 +1273,9 @@ void vp8_init_second_pass(VP8_COMP *cpi) { * sum duration is not. Its calculated based on the actual durations of * all frames from the first pass. 
*/ - vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / - cpi->twopass.total_stats.duration); + vp8_new_framerate(cpi, + 10000000.0 * cpi->twopass.total_stats.count / + cpi->twopass.total_stats.duration); cpi->output_framerate = cpi->framerate; cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * @@ -1655,7 +1657,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { cpi->twopass.gf_group_bits = 0; cpi->twopass.gf_decay_rate = 0; - vp8_clear_system_state(); + vpx_clear_system_state(); start_pos = cpi->twopass.stats_in; @@ -2268,7 +2270,7 @@ void vp8_second_pass(VP8_COMP *cpi) { return; } - vp8_clear_system_state(); + vpx_clear_system_state(); if (EOF == input_stats(cpi, &this_frame)) return; @@ -2543,7 +2545,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { memset(&next_frame, 0, sizeof(next_frame)); - vp8_clear_system_state(); + vpx_clear_system_state(); start_position = cpi->twopass.stats_in; cpi->common.frame_type = KEY_FRAME; diff --git a/libs/libvpx/vp8/encoder/mips/mmi/dct_mmi.c b/libs/libvpx/vp8/encoder/mips/mmi/dct_mmi.c new file mode 100644 index 0000000000..1f60a692d0 --- /dev/null +++ b/libs/libvpx/vp8/encoder/mips/mmi/dct_mmi.c @@ -0,0 +1,425 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +/* clang-format off */ +/* TRANSPOSE_4H: transpose 4x4 matrix. + Input: ftmp1,ftmp2,ftmp3,ftmp4 + Output: ftmp1,ftmp2,ftmp3,ftmp4 + Note: ftmp0 always be 0, ftmp5~9 used for temporary value. 
+ */ +#define TRANSPOSE_4H \ + MMI_LI(%[tmp0], 0x93) \ + "mtc1 %[tmp0], %[ftmp10] \n\t" \ + "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \ + "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \ + "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \ + "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t" +/* clang-format on */ + +void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { + uint64_t tmp[1]; + int16_t *ip = input; + +#if _MIPS_SIM == _ABIO32 + register double ftmp0 asm("$f0"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f4"); + register double ftmp3 asm("$f6"); + register double ftmp4 asm("$f8"); + register double ftmp5 asm("$f10"); + register double ftmp6 asm("$f12"); + register double ftmp7 asm("$f14"); + register double ftmp8 asm("$f16"); + register double ftmp9 asm("$f18"); + register double ftmp10 asm("$f20"); + register double ftmp11 asm("$f22"); + register double ftmp12 asm("$f24"); +#else + register double ftmp0 asm("$f0"); + register double ftmp1 asm("$f1"); + register double ftmp2 asm("$f2"); + register double ftmp3 asm("$f3"); + register double ftmp4 asm("$f4"); + register double ftmp5 asm("$f5"); + register double ftmp6 asm("$f6"); + register double ftmp7 asm("$f7"); + register double ftmp8 asm("$f8"); + register double ftmp9 asm("$f9"); + register double ftmp10 asm("$f10"); + register double ftmp11 asm("$f11"); + register double ftmp12 asm("$f12"); +#endif // _MIPS_SIM == _ABIO32 + + DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_07) = { 0x0007000700070007ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_12000) = { 0x00002ee000002ee0ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL }; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t" + 
MMI_ADDU(%[ip], %[ip], %[pitch]) + TRANSPOSE_4H + + "ldc1 %[ftmp11], %[ff_ph_8] \n\t" + // f1 + f4 + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + // a1 + "pmullh %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + // f2 + f3 + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + // b1 + "pmullh %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + // f2 - f3 + "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + // c1 + "pmullh %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + // f1 - f4 + "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t" + // d1 + "pmullh %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + // op[0] = a1 + b1 + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + // op[2] = a1 - b1 + "psubh %[ftmp3], %[ftmp5], %[ftmp6] \n\t" + + // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12 + MMI_LI(%[tmp0], 0x0c) + "mtc1 %[tmp0], %[ftmp11] \n\t" + "ldc1 %[ftmp12], %[ff_pw_14500] \n\t" + "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t" + "punpckhhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op1] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + + // op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12 + "ldc1 %[ftmp12], %[ff_pw_7500] \n\t" + "punpcklhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op3] \n\t" + "punpckhhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op3] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + "packsswh %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + TRANSPOSE_4H + + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t" + + "pcmpeqh %[ftmp0], %[ftmp8], %[ftmp0] \n\t" + "ldc1 %[ftmp9], %[ff_ph_01] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "psubh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "ldc1 %[ftmp9], %[ff_ph_07] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + MMI_LI(%[tmp0], 0x04) + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + + MMI_LI(%[tmp0], 0x10) + "mtc1 %[tmp0], %[ftmp9] \n\t" + "ldc1 %[ftmp12], %[ff_pw_12000] \n\t" + "punpcklhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op1] \n\t" + "punpckhhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op1] \n\t" + "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + "packsswh %[ftmp3], %[ftmp10], %[ftmp11] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + + "ldc1 %[ftmp12], %[ff_pw_51000] \n\t" + "punpcklhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op3] \n\t" + "punpckhhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op3] \n\t" + "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + "packsswh %[ftmp4], %[ftmp10], %[ftmp11] \n\t" + + "gssdlc1 %[ftmp1], 0x07(%[output]) \n\t" + "gssdrc1 %[ftmp1], 
0x00(%[output]) \n\t" + "gssdlc1 %[ftmp3], 0x0f(%[output]) \n\t" + "gssdrc1 %[ftmp3], 0x08(%[output]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[output]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[output]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[output]) \n\t" + "gssdrc1 %[ftmp4], 0x18(%[output]) \n\t" + + : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2), + [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5), + [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8), + [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11), + [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip) + : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07), + [ff_ph_op1] "f"(ff_ph_op1), [ff_ph_op3] "f"(ff_ph_op3), + [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500), + [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000), + [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217), + [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output) + : "memory" + ); +} + +void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) { + vp8_short_fdct4x4_mmi(input, output, pitch); + vp8_short_fdct4x4_mmi(input + 4, output + 16, pitch); +} + +void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { + double ftmp[13]; + uint32_t tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL }; + + __asm__ volatile ( + MMI_LI(%[tmp0], 0x02) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t" + TRANSPOSE_4H + + "psllh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "psllh %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "psllh %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "psllh %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // d + "paddh %[ftmp6], %[ftmp2], %[ftmp4] \n\t" + // c + "psubh %[ftmp7], %[ftmp2], %[ftmp4] \n\t" + // b + "psubh %[ftmp8], %[ftmp1], %[ftmp3] \n\t" + + // a + d + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + // b + c + "paddh %[ftmp2], %[ftmp8], %[ftmp7] \n\t" + // b - c + "psubh %[ftmp3], %[ftmp8], %[ftmp7] \n\t" + // a - d + "psubh %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + + "pcmpeqh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ff_ph_01] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + TRANSPOSE_4H + + // op[2], op[0] + "pmaddhw %[ftmp5], %[ftmp1], %[ff_pw_01] \n\t" + // op[3], op[1] + "pmaddhw %[ftmp1], %[ftmp1], %[ff_pw_mask] \n\t" + + // op[6], op[4] + "pmaddhw %[ftmp6], %[ftmp2], %[ff_pw_01] \n\t" + // op[7], op[5] + "pmaddhw %[ftmp2], %[ftmp2], %[ff_pw_mask] \n\t" + + // op[10], op[8] + "pmaddhw %[ftmp7], %[ftmp3], %[ff_pw_01] \n\t" + // op[11], op[9] + "pmaddhw %[ftmp3], %[ftmp3], %[ff_pw_mask] \n\t" + + // op[14], op[12] + "pmaddhw %[ftmp8], %[ftmp4], %[ff_pw_01] \n\t" + // op[15], op[13] + "pmaddhw %[ftmp4], %[ftmp4], %[ff_pw_mask] \n\t" + + // a1, a3 + "paddw %[ftmp9], %[ftmp5], %[ftmp7] \n\t" + // 
d1, d3 + "paddw %[ftmp10], %[ftmp6], %[ftmp8] \n\t" + // c1, c3 + "psubw %[ftmp11], %[ftmp6], %[ftmp8] \n\t" + // b1, b3 + "psubw %[ftmp12], %[ftmp5], %[ftmp7] \n\t" + + // a1 + d1, a3 + d3 + "paddw %[ftmp5], %[ftmp9], %[ftmp10] \n\t" + // b1 + c1, b3 + c3 + "paddw %[ftmp6], %[ftmp12], %[ftmp11] \n\t" + // b1 - c1, b3 - c3 + "psubw %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + // a1 - d1, a3 - d3 + "psubw %[ftmp8], %[ftmp9], %[ftmp10] \n\t" + + // a2, a4 + "paddw %[ftmp9], %[ftmp1], %[ftmp3] \n\t" + // d2, d4 + "paddw %[ftmp10], %[ftmp2], %[ftmp4] \n\t" + // c2, c4 + "psubw %[ftmp11], %[ftmp2], %[ftmp4] \n\t" + // b2, b4 + "psubw %[ftmp12], %[ftmp1], %[ftmp3] \n\t" + + // a2 + d2, a4 + d4 + "paddw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + // b2 + c2, b4 + c4 + "paddw %[ftmp2], %[ftmp12], %[ftmp11] \n\t" + // b2 - c2, b4 - c4 + "psubw %[ftmp3], %[ftmp12], %[ftmp11] \n\t" + // a2 - d2, a4 - d4 + "psubw %[ftmp4], %[ftmp9], %[ftmp10] \n\t" + + MMI_LI(%[tmp0], 0x03) + "mtc1 %[tmp0], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp1] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ff_pw_03] \n\t" + "psraw %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp2] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddw %[ftmp2], %[ftmp2], %[ff_pw_03] \n\t" + "psraw %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp3] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ff_pw_03] \n\t" + "psraw %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp4] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + "paddw %[ftmp4], %[ftmp4], %[ff_pw_03] \n\t" + "psraw %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp5] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ff_pw_03] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp6] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ff_pw_03] \n\t" + "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp7] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp7], %[ftmp7], %[ftmp9] \n\t" + "paddw %[ftmp7], %[ftmp7], %[ff_pw_03] \n\t" + "psraw %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp8] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" + "paddw %[ftmp8], %[ftmp8], %[ff_pw_03] \n\t" + "psraw %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + + "packsswh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "packsswh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "packsswh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "packsswh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + + MMI_LI(%[tmp0], 0x72) + "mtc1 %[tmp0], %[ftmp11] \n\t" + "pshufh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "pshufh %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "pshufh %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "pshufh %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t" + "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t" + "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t" + "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t" + "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t" + "gssdrc1 
%[ftmp4], 0x18(%[op]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), + [tmp0]"=&r"(tmp[0]), + [ip]"+&r"(input) + : [op]"r"(output), + [ff_pw_01]"f"(ff_pw_01), [pitch]"r"((mips_reg)pitch), + [ff_pw_03]"f"(ff_pw_03), [ff_pw_mask]"f"(ff_pw_mask), + [ff_ph_01]"f"(ff_ph_01) + : "memory" + ); +} diff --git a/libs/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c b/libs/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c new file mode 100644 index 0000000000..3ccb196ffb --- /dev/null +++ b/libs/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/asmdefs_mmi.h" +#include "vp8/encoder/onyx_int.h" +#include "vp8/encoder/quantize.h" +#include "vp8/common/quant_common.h" + +#define REGULAR_SELECT_EOB(i, rc) \ + z = coeff_ptr[rc]; \ + sz = (z >> 31); \ + x = (z ^ sz) - sz; \ + zbin = zbin_ptr[rc] + *(zbin_boost_ptr++) + zbin_oq_value; \ + if (x >= zbin) { \ + x += round_ptr[rc]; \ + y = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16; \ + if (y) { \ + x = (y ^ sz) - sz; \ + qcoeff_ptr[rc] = x; \ + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; \ + eob = i; \ + zbin_boost_ptr = b->zrun_zbin_boost; \ + } \ + } + +void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { + const int16_t *coeff_ptr = b->coeff; + const int16_t *round_ptr = b->round; + const int16_t *quant_ptr = b->quant_fast; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + const int16_t *dequant_ptr = d->dequant; + const int16_t *inv_zig_zag = vp8_default_inv_zig_zag; + + double ftmp[13]; + uint64_t tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, ones) = { 0xffffffffffffffffULL }; + int eob = 0; + + __asm__ volatile( + // loop 0 ~ 7 + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[coeff_ptr]) \n\t" + "li %[tmp0], 0x0f \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[coeff_ptr]) \n\t" + + "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t" + "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t" + "xor %[ftmp2], %[ftmp4], %[ftmp2] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + + "gsldlc1 %[ftmp5], 0x07(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[round_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x0f(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x08(%[round_ptr]) \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "gsldlc1 %[ftmp7], 0x07(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[quant_ptr]) \n\t" + "gsldlc1 %[ftmp8], 0x0f(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x08(%[quant_ptr]) \n\t" + "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + + "xor %[ftmp7], %[ftmp5], %[ftmp3] \n\t" + "xor %[ftmp8], %[ftmp6], 
%[ftmp4] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "gssdlc1 %[ftmp7], 0x07(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp8], 0x0f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x08(%[qcoeff_ptr]) \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[inv_zig_zag]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[inv_zig_zag]) \n\t" + "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp5], %[ftmp5], %[ones] \n\t" + "xor %[ftmp6], %[ftmp6], %[ones] \n\t" + "and %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "and %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "pmaxsh %[ftmp10], %[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp5], 0x07(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[dequant_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x0f(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x08(%[dequant_ptr]) \n\t" + "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "gssdlc1 %[ftmp5], 0x07(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp6], 0x0f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x08(%[dqcoeff_ptr]) \n\t" + + // loop 8 ~ 15 + "gsldlc1 %[ftmp1], 0x17(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[coeff_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[coeff_ptr]) \n\t" + + "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t" + "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t" + "xor %[ftmp2], %[ftmp4], %[ftmp2] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + + "gsldlc1 %[ftmp5], 0x17(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x10(%[round_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x1f(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x18(%[round_ptr]) \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "gsldlc1 %[ftmp7], 0x17(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x10(%[quant_ptr]) \n\t" + "gsldlc1 %[ftmp8], 0x1f(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x18(%[quant_ptr]) \n\t" + "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + + "xor %[ftmp7], %[ftmp5], %[ftmp3] \n\t" + "xor %[ftmp8], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "gssdlc1 %[ftmp7], 0x17(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x10(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp8], 0x1f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x18(%[qcoeff_ptr]) \n\t" + + "gsldlc1 %[ftmp1], 0x17(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[inv_zig_zag]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[inv_zig_zag]) \n\t" + "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp5], %[ftmp5], %[ones] \n\t" + "xor %[ftmp6], %[ftmp6], %[ones] \n\t" + "and %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "and %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "pmaxsh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp5], 0x17(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x10(%[dequant_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x1f(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x18(%[dequant_ptr]) \n\t" + "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "gssdlc1 %[ftmp5], 0x17(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x10(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp6], 
0x1f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x18(%[dqcoeff_ptr]) \n\t" + + "li %[tmp0], 0x10 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + + "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "psrlw %[ftmp11], %[ftmp10], %[ftmp9] \n\t" + "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "li %[tmp0], 0xaa \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "pshufh %[ftmp11], %[ftmp10], %[ftmp9] \n\t" + "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "li %[tmp0], 0xffff \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "and %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "gssdlc1 %[ftmp10], 0x07(%[eob]) \n\t" + "gssdrc1 %[ftmp10], 0x00(%[eob]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), + [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) + : [coeff_ptr] "r"((mips_reg)coeff_ptr), + [qcoeff_ptr] "r"((mips_reg)qcoeff_ptr), + [dequant_ptr] "r"((mips_reg)dequant_ptr), + [round_ptr] "r"((mips_reg)round_ptr), + [quant_ptr] "r"((mips_reg)quant_ptr), + [dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr), + [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob), + [ones] "f"(ones) + : "memory"); + + *d->eob = eob; +} + +void vp8_regular_quantize_b_mmi(BLOCK *b, BLOCKD *d) { + int eob = 0; + int x, y, z, sz, zbin; + const int16_t *zbin_boost_ptr = b->zrun_zbin_boost; + const int16_t *coeff_ptr = b->coeff; + const int16_t *zbin_ptr = b->zbin; + const int16_t *round_ptr = b->round; + const int16_t *quant_ptr = b->quant; + const int16_t *quant_shift_ptr = b->quant_shift; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + const int16_t *dequant_ptr = d->dequant; + const int16_t zbin_oq_value = b->zbin_extra; + register double ftmp0 asm("$f0"); + + // memset(qcoeff_ptr, 0, 32); + // memset(dqcoeff_ptr, 0, 32); + /* clang-format off */ + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x0f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x08(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x1f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x18(%[qcoeff_ptr]) \n\t" + + "gssdlc1 %[ftmp0], 0x07(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x0f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x08(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x1f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x18(%[dqcoeff_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp0) + : [qcoeff_ptr]"r"(qcoeff_ptr), [dqcoeff_ptr]"r"(dqcoeff_ptr) + : "memory" + ); + /* clang-format on */ + + REGULAR_SELECT_EOB(1, 0); + REGULAR_SELECT_EOB(2, 1); + REGULAR_SELECT_EOB(3, 4); + REGULAR_SELECT_EOB(4, 8); + REGULAR_SELECT_EOB(5, 5); + REGULAR_SELECT_EOB(6, 2); + REGULAR_SELECT_EOB(7, 3); + REGULAR_SELECT_EOB(8, 6); + REGULAR_SELECT_EOB(9, 9); + REGULAR_SELECT_EOB(10, 12); + REGULAR_SELECT_EOB(11, 13); + REGULAR_SELECT_EOB(12, 10); + REGULAR_SELECT_EOB(13, 7); + REGULAR_SELECT_EOB(14, 11); + REGULAR_SELECT_EOB(15, 14); + REGULAR_SELECT_EOB(16, 15); + + *d->eob = (char)eob; +} diff --git a/libs/libvpx/vp8/encoder/mips/msa/quantize_msa.c b/libs/libvpx/vp8/encoder/mips/msa/quantize_msa.c 
index 11f70ae82e..9f5fbd39c8 100644 --- a/libs/libvpx/vp8/encoder/mips/msa/quantize_msa.c +++ b/libs/libvpx/vp8/encoder/mips/msa/quantize_msa.c @@ -12,10 +12,9 @@ #include "vp8/common/mips/msa/vp8_macros_msa.h" #include "vp8/encoder/block.h" -static int8_t fast_quantize_b_msa(int16_t *coeff_ptr, int16_t *zbin, - int16_t *round, int16_t *quant, - int16_t *de_quant, int16_t *q_coeff, - int16_t *dq_coeff) { +static int8_t fast_quantize_b_msa(int16_t *coeff_ptr, int16_t *round, + int16_t *quant, int16_t *de_quant, + int16_t *q_coeff, int16_t *dq_coeff) { int32_t cnt, eob; v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 }; v8i16 round0, round1; @@ -184,15 +183,14 @@ static int8_t exact_regular_quantize_b_msa( void vp8_fast_quantize_b_msa(BLOCK *b, BLOCKD *d) { int16_t *coeff_ptr = b->coeff; - int16_t *zbin_ptr = b->zbin; int16_t *round_ptr = b->round; int16_t *quant_ptr = b->quant_fast; int16_t *qcoeff_ptr = d->qcoeff; int16_t *dqcoeff_ptr = d->dqcoeff; int16_t *dequant_ptr = d->dequant; - *d->eob = fast_quantize_b_msa(coeff_ptr, zbin_ptr, round_ptr, quant_ptr, - dequant_ptr, qcoeff_ptr, dqcoeff_ptr); + *d->eob = fast_quantize_b_msa(coeff_ptr, round_ptr, quant_ptr, dequant_ptr, + qcoeff_ptr, dqcoeff_ptr); } void vp8_regular_quantize_b_msa(BLOCK *b, BLOCKD *d) { diff --git a/libs/libvpx/vp8/encoder/onyx_if.c b/libs/libvpx/vp8/encoder/onyx_if.c index 6ebf233edf..2243182425 100644 --- a/libs/libvpx/vp8/encoder/onyx_if.c +++ b/libs/libvpx/vp8/encoder/onyx_if.c @@ -12,10 +12,12 @@ #include "./vpx_scale_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "./vp8_rtcd.h" +#include "bitstream.h" #include "vp8/common/onyxc_int.h" #include "vp8/common/blockd.h" #include "onyx_int.h" #include "vp8/common/systemdependent.h" +#include "vp8/common/vp8_skin_detection.h" #include "vp8/encoder/quantize.h" #include "vp8/common/alloccommon.h" #include "mcomp.h" @@ -33,7 +35,9 @@ #include "vp8/common/reconintra.h" #include "vp8/common/swapyv12buffer.h" #include "vp8/common/threading.h" +#include "vpx_ports/system_state.h" #include "vpx_ports/vpx_timer.h" +#include "vpx_util/vpx_write_yuv_frame.h" #if ARCH_ARM #include "vpx_ports/arm.h" #endif @@ -41,35 +45,32 @@ #include "mr_dissim.h" #endif #include "encodeframe.h" +#if CONFIG_MULTITHREAD +#include "ethreading.h" +#endif +#include "picklpf.h" +#if !CONFIG_REALTIME_ONLY +#include "temporal_filter.h" +#endif +#include #include #include #include #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING extern int vp8_update_coef_context(VP8_COMP *cpi); -extern void vp8_update_coef_probs(VP8_COMP *cpi); #endif -extern void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi); -extern void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val); -extern void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi); - extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int filt_lvl, int low_var_thresh, int flag); extern void print_parms(VP8_CONFIG *ocf, char *filenam); extern unsigned int vp8_get_processor_freq(); extern void print_tree_update_probs(); -extern int vp8cx_create_encoder_threads(VP8_COMP *cpi); -extern void vp8cx_remove_encoder_threads(VP8_COMP *cpi); - -int vp8_estimate_entropy_savings(VP8_COMP *cpi); int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest); -extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance); - static void set_default_lf_deltas(VP8_COMP *cpi); extern const int vp8_gf_interval_table[101]; @@ -85,6 +86,9 @@ FILE *yuv_file; #ifdef 
OUTPUT_YUV_DENOISED FILE *yuv_denoised_file; #endif +#ifdef OUTPUT_YUV_SKINMAP +static FILE *yuv_skinmap_file = NULL; +#endif #if 0 FILE *framepsnr; @@ -217,7 +221,8 @@ static void save_layer_context(VP8_COMP *cpi) { lc->inter_frame_target = cpi->inter_frame_target; lc->total_byte_count = cpi->total_byte_count; lc->filter_level = cpi->common.filter_level; - + lc->frames_since_last_drop_overshoot = cpi->frames_since_last_drop_overshoot; + lc->force_maxqp = cpi->force_maxqp; lc->last_frame_percent_intra = cpi->last_frame_percent_intra; memcpy(lc->count_mb_ref_frame_usage, cpi->mb.count_mb_ref_frame_usage, @@ -253,7 +258,8 @@ static void restore_layer_context(VP8_COMP *cpi, const int layer) { cpi->inter_frame_target = lc->inter_frame_target; cpi->total_byte_count = lc->total_byte_count; cpi->common.filter_level = lc->filter_level; - + cpi->frames_since_last_drop_overshoot = lc->frames_since_last_drop_overshoot; + cpi->force_maxqp = lc->force_maxqp; cpi->last_frame_percent_intra = lc->last_frame_percent_intra; memcpy(cpi->mb.count_mb_ref_frame_usage, lc->count_mb_ref_frame_usage, @@ -602,6 +608,59 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) { set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); } +static void compute_skin_map(VP8_COMP *cpi) { + int mb_row, mb_col, num_bl; + VP8_COMMON *cm = &cpi->common; + const uint8_t *src_y = cpi->Source->y_buffer; + const uint8_t *src_u = cpi->Source->u_buffer; + const uint8_t *src_v = cpi->Source->v_buffer; + const int src_ystride = cpi->Source->y_stride; + const int src_uvstride = cpi->Source->uv_stride; + + const SKIN_DETECTION_BLOCK_SIZE bsize = + (cm->Width * cm->Height <= 352 * 288) ? SKIN_8X8 : SKIN_16X16; + + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { + num_bl = 0; + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { + const int bl_index = mb_row * cm->mb_cols + mb_col; + cpi->skin_map[bl_index] = + vp8_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride, + bsize, cpi->consec_zero_last[bl_index], 0); + num_bl++; + src_y += 16; + src_u += 8; + src_v += 8; + } + src_y += (src_ystride << 4) - (num_bl << 4); + src_u += (src_uvstride << 3) - (num_bl << 3); + src_v += (src_uvstride << 3) - (num_bl << 3); + } + + // Remove isolated skin blocks (none of its neighbors are skin) and isolated + // non-skin blocks (all of its neighbors are skin). Skip the boundary. 
+ for (mb_row = 1; mb_row < cm->mb_rows - 1; mb_row++) { + for (mb_col = 1; mb_col < cm->mb_cols - 1; mb_col++) { + const int bl_index = mb_row * cm->mb_cols + mb_col; + int num_neighbor = 0; + int mi, mj; + int non_skin_threshold = 8; + + for (mi = -1; mi <= 1; mi += 1) { + for (mj = -1; mj <= 1; mj += 1) { + int bl_neighbor_index = (mb_row + mi) * cm->mb_cols + mb_col + mj; + if (cpi->skin_map[bl_neighbor_index]) num_neighbor++; + } + } + + if (cpi->skin_map[bl_index] && num_neighbor < 2) + cpi->skin_map[bl_index] = 0; + if (!cpi->skin_map[bl_index] && num_neighbor == non_skin_threshold) + cpi->skin_map[bl_index] = 1; + } + } +} + static void set_default_lf_deltas(VP8_COMP *cpi) { cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1; cpi->mb.e_mbd.mode_ref_lf_delta_update = 1; @@ -714,6 +773,7 @@ void vp8_set_speed_features(VP8_COMP *cpi) { SPEED_FEATURES *sf = &cpi->sf; int Mode = cpi->compressor_speed; int Speed = cpi->Speed; + int Speed2; int i; VP8_COMMON *cm = &cpi->common; int last_improved_quant = sf->improved_quant; @@ -815,9 +875,16 @@ void vp8_set_speed_features(VP8_COMP *cpi) { cpi->mode_check_freq[THR_V_PRED] = cpi->mode_check_freq[THR_H_PRED] = cpi->mode_check_freq[THR_B_PRED] = speed_map(Speed, mode_check_freq_map_vhbpred); - cpi->mode_check_freq[THR_NEW1] = speed_map(Speed, mode_check_freq_map_new1); + + // For real-time mode at speed 10 keep the mode_check_freq threshold + // for NEW1 similar to that of speed 9. + Speed2 = Speed; + if (cpi->Speed == 10 && Mode == 2) Speed2 = RT(9); + cpi->mode_check_freq[THR_NEW1] = speed_map(Speed2, mode_check_freq_map_new1); + cpi->mode_check_freq[THR_NEW2] = cpi->mode_check_freq[THR_NEW3] = speed_map(Speed, mode_check_freq_map_new2); + cpi->mode_check_freq[THR_SPLIT1] = speed_map(Speed, mode_check_freq_map_split1); cpi->mode_check_freq[THR_SPLIT2] = cpi->mode_check_freq[THR_SPLIT3] = @@ -1163,9 +1230,13 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { } if (cpi->oxcf.multi_threaded > 1) { + int i; + vpx_free(cpi->mt_current_mb_col); CHECK_MEM_ERROR(cpi->mt_current_mb_col, vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); + for (i = 0; i < cm->mb_rows; ++i) + vpx_atomic_init(&cpi->mt_current_mb_col[i], 0); } #endif @@ -1466,6 +1537,12 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL; + // GF behavior for 1 pass CBR, used when error_resilience is off. 
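/* Context for the assignment below: gf_interval_onepass_cbr is derived at
 * compressor creation (later in this patch) as roughly two full cyclic-refresh
 * sweeps of the frame, clamped to [6, 40] frames.  Sketch of that derivation;
 * the refresh budget divisor shown here (MBs / 7) is the initial value from
 * the surrounding code and may be adjusted for other layer configurations. */
static int gf_interval_onepass_cbr_sketch(int mb_rows, int mb_cols) {
  const int mbs = mb_rows * mb_cols;
  const int refresh_mbs_per_frame = mbs / 7;
  int interval =
      refresh_mbs_per_frame > 0 ? (2 * mbs) / refresh_mbs_per_frame : 10;
  if (interval < 6) interval = 6;
  if (interval > 40) interval = 40;
  return interval; /* e.g. 640x480: 1200 MBs, budget 171, interval 14 */
}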
+ if (!cpi->oxcf.error_resilient_mode && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->oxcf.Mode == MODE_REALTIME) + cpi->baseline_gf_interval = cpi->gf_interval_onepass_cbr; + #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) cpi->oxcf.token_partitions = 3; #endif @@ -1476,9 +1553,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { setup_features(cpi); - { + if (!cpi->use_roi_static_threshold) { int i; - for (i = 0; i < MAX_MB_SEGMENTS; ++i) { cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; } @@ -1592,8 +1668,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { cm->sharpness_level = cpi->oxcf.Sharpness; if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) { - int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs); - int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs); + int hr, hs, vr, vs; Scale2Ratio(cm->horiz_scale, &hr, &hs); Scale2Ratio(cm->vert_scale, &vr, &vs); @@ -1739,6 +1814,8 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->active_map_enabled = 0; + cpi->use_roi_static_threshold = 0; + #if 0 /* Experimental code for lagged and one pass */ /* Initialise one_pass GF frames stats */ @@ -1765,9 +1842,13 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->mse_source_denoised = 0; /* Should we use the cyclic refresh method. - * Currently this is tied to error resilliant mode + * Currently there is no external control for this. + * Enable it for error_resilient_mode, or for 1 pass CBR mode. */ - cpi->cyclic_refresh_mode_enabled = cpi->oxcf.error_resilient_mode; + cpi->cyclic_refresh_mode_enabled = + (cpi->oxcf.error_resilient_mode || + (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->oxcf.Mode <= 2)); cpi->cyclic_refresh_mode_max_mbs_perframe = (cpi->common.mb_rows * cpi->common.mb_cols) / 7; if (cpi->oxcf.number_of_layers == 1) { @@ -1780,6 +1861,23 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->cyclic_refresh_mode_index = 0; cpi->cyclic_refresh_q = 32; + // GF behavior for 1 pass CBR, used when error_resilience is off. + cpi->gf_update_onepass_cbr = 0; + cpi->gf_noboost_onepass_cbr = 0; + if (!cpi->oxcf.error_resilient_mode && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && cpi->oxcf.Mode <= 2) { + cpi->gf_update_onepass_cbr = 1; + cpi->gf_noboost_onepass_cbr = 1; + cpi->gf_interval_onepass_cbr = + cpi->cyclic_refresh_mode_max_mbs_perframe > 0 + ? 
(2 * (cpi->common.mb_rows * cpi->common.mb_cols) / + cpi->cyclic_refresh_mode_max_mbs_perframe) + : 10; + cpi->gf_interval_onepass_cbr = + VPXMIN(40, VPXMAX(6, cpi->gf_interval_onepass_cbr)); + cpi->baseline_gf_interval = cpi->gf_interval_onepass_cbr; + } + if (cpi->cyclic_refresh_mode_enabled) { CHECK_MEM_ERROR(cpi->cyclic_refresh_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); @@ -1787,6 +1885,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->cyclic_refresh_map = (signed char *)NULL; } + CHECK_MEM_ERROR(cpi->skin_map, vpx_calloc(cm->mb_rows * cm->mb_cols, + sizeof(cpi->skin_map[0]))); + CHECK_MEM_ERROR(cpi->consec_zero_last, vpx_calloc(cm->mb_rows * cm->mb_cols, 1)); CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias, @@ -1810,6 +1911,7 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->common.refresh_alt_ref_frame = 0; cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot = 0; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS @@ -1863,6 +1965,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { #ifdef OUTPUT_YUV_DENOISED yuv_denoised_file = fopen("denoised.yuv", "ab"); #endif +#ifdef OUTPUT_YUV_SKINMAP + yuv_skinmap_file = fopen("skinmap.yuv", "wb"); +#endif #if 0 framepsnr = fopen("framepsnr.stt", "a"); @@ -2028,11 +2133,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) { double time_encoded = (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) / 10000000.000; - double total_encode_time = - (cpi->time_receive_data + cpi->time_compress_data) / 1000.000; double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded; - const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000; - const double rate_err = ((100.0 * (dr - target_rate)) / target_rate); if (cpi->b_calculate_psnr) { if (cpi->oxcf.number_of_layers > 1) { @@ -2040,7 +2141,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) { fprintf(f, "Layer\tBitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t" - "GLPsnrP\tVPXSSIM\t\n"); + "GLPsnrP\tVPXSSIM\n"); for (i = 0; i < (int)cpi->oxcf.number_of_layers; ++i) { double dr = (double)cpi->bytes_in_layer[i] * 8.0 / 1000.0 / time_encoded; @@ -2072,14 +2173,12 @@ void vp8_remove_compressor(VP8_COMP **ptr) { fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t" - "GLPsnrP\tVPXSSIM\tTime(us)\tRc-Err\t" - "Abs Err\n"); + "GLPsnrP\tVPXSSIM\n"); fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" - "%7.3f\t%8.0f\t%7.2f\t%7.2f\n", + "%7.3f\n", dr, cpi->total / cpi->count, total_psnr, - cpi->totalp / cpi->count, total_psnr2, total_ssim, - total_encode_time, rate_err, fabs(rate_err)); + cpi->totalp / cpi->count, total_psnr2, total_ssim); } } fclose(f); @@ -2220,6 +2319,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) { dealloc_compressor_data(cpi); vpx_free(cpi->mb.ss); vpx_free(cpi->tok); + vpx_free(cpi->skin_map); vpx_free(cpi->cyclic_refresh_map); vpx_free(cpi->consec_zero_last); vpx_free(cpi->consec_zero_last_mvbias); @@ -2234,6 +2334,9 @@ void vp8_remove_compressor(VP8_COMP **ptr) { #ifdef OUTPUT_YUV_DENOISED fclose(yuv_denoised_file); #endif +#ifdef OUTPUT_YUV_SKINMAP + fclose(yuv_skinmap_file); +#endif #if 0 @@ -2296,7 +2399,7 @@ static uint64_t calc_plane_error(unsigned char *orig, int orig_stride, recon += recon_stride; } - vp8_clear_system_state(); + vpx_clear_system_state(); return total_sse; } @@ -2410,42 +2513,13 @@ int vp8_update_entropy(VP8_COMP *cpi, int update) { return 0; } -#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) -void vp8_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) { - 
unsigned char *src = s->y_buffer; - int h = s->y_height; - - do { - fwrite(src, s->y_width, 1, yuv_file); - src += s->y_stride; - } while (--h); - - src = s->u_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, yuv_file); - src += s->uv_stride; - } while (--h); - - src = s->v_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, yuv_file); - src += s->uv_stride; - } while (--h); -} -#endif - static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) { VP8_COMMON *cm = &cpi->common; /* are we resizing the image */ if (cm->horiz_scale != 0 || cm->vert_scale != 0) { #if CONFIG_SPATIAL_RESAMPLING - int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs); - int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs); + int hr, hs, vr, vs; int tmp_height; if (cm->vert_scale == 3) { @@ -2478,8 +2552,7 @@ static int resize_key_frame(VP8_COMP *cpi) { */ if (cpi->oxcf.allow_spatial_resampling && (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) { - int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs); - int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs); + int hr, hs, vr, vs; int new_width, new_height; /* If we are below the resample DOWN watermark then scale down a @@ -2691,7 +2764,7 @@ static int decide_key_frame(VP8_COMP *cpi) { if (cpi->Speed > 11) return 0; /* Clear down mmx registers */ - vp8_clear_system_state(); + vpx_clear_system_state(); if ((cpi->compressor_speed == 2) && (cpi->Speed >= 5) && (cpi->sf.RD == 0)) { double change = 1.0 * @@ -2852,8 +2925,7 @@ static void update_reference_frames(VP8_COMP *cpi) { cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame; cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame; - } else /* For non key frames */ - { + } else { if (cm->refresh_alt_ref_frame) { assert(!cm->copy_buffer_to_arf); @@ -2874,8 +2946,7 @@ static void update_reference_frames(VP8_COMP *cpi) { cpi->current_ref_frames[ALTREF_FRAME] = cpi->current_ref_frames[LAST_FRAME]; } - } else /* if (cm->copy_buffer_to_arf == 2) */ - { + } else { if (cm->alt_fb_idx != cm->gld_fb_idx) { yv12_fb[cm->gld_fb_idx].flags |= VP8_ALTR_FRAME; yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME; @@ -2907,8 +2978,7 @@ static void update_reference_frames(VP8_COMP *cpi) { cpi->current_ref_frames[GOLDEN_FRAME] = cpi->current_ref_frames[LAST_FRAME]; } - } else /* if (cm->copy_buffer_to_gf == 2) */ - { + } else { if (cm->alt_fb_idx != cm->gld_fb_idx) { yv12_fb[cm->alt_fb_idx].flags |= VP8_GOLD_FRAME; yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME; @@ -2939,8 +3009,7 @@ static void update_reference_frames(VP8_COMP *cpi) { int i; for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i) vp8_yv12_copy_frame(cpi->Source, &cpi->denoiser.yv12_running_avg[i]); - } else /* For non key frames */ - { + } else { vp8_yv12_extend_frame_borders( &cpi->denoiser.yv12_running_avg[INTRA_FRAME]); @@ -2995,6 +3064,7 @@ static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source, } // Only return non-zero if we have at least ~1/16 samples for estimate. 
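/* Note on the vp8_clear_system_state() -> vpx_clear_system_state() renames
 * throughout this file: the call now comes from the shared
 * vpx_ports/system_state.h helper included at the top of this diff.  As a
 * rough sketch of what such a helper amounts to on x86 (an assumption for
 * illustration only; the real implementation is configuration dependent), it
 * clears MMX register state so the floating-point code that follows is safe: */
#if defined(__i386__) || defined(__x86_64__)
static void clear_system_state_sketch(void) { __asm__ volatile("emms"); }
#else
static void clear_system_state_sketch(void) {}
#endif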
if (num_blocks > (tot_num_blocks >> 4)) { + assert(num_blocks != 0); return (Total / num_blocks); } else { return 0; @@ -3129,7 +3199,7 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { } else { struct vpx_usec_timer timer; - vp8_clear_system_state(); + vpx_clear_system_state(); vpx_usec_timer_start(&timer); if (cpi->sf.auto_filter == 0) { @@ -3171,7 +3241,7 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { } #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */ } #endif @@ -3217,7 +3287,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, int drop_mark25 = drop_mark / 8; /* Clear down mmx registers to allow floating point in what follows */ - vp8_clear_system_state(); + vpx_clear_system_state(); if (cpi->force_next_frame_intra) { cm->frame_type = KEY_FRAME; /* delayed intra frame */ @@ -3576,7 +3646,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, * There is some odd behavior for one pass here that needs attention. */ if ((cpi->pass == 2) || (cpi->ni_frames > 150)) { - vp8_clear_system_state(); + vpx_clear_system_state(); Q = cpi->active_worst_quality; @@ -3725,6 +3795,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } #endif + compute_skin_map(cpi); + /* Setup background Q adjustment for error resilient mode. * For multi-layer encodes only enable this for the base layer. */ @@ -3798,11 +3870,11 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif #ifdef OUTPUT_YUV_SRC - vp8_write_yuv_frame(yuv_file, cpi->Source); + vpx_write_yuv_frame(yuv_file, cpi->Source); #endif do { - vp8_clear_system_state(); + vpx_clear_system_state(); vp8_set_quantizer(cpi, Q); @@ -3927,7 +3999,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, /* transform / motion compensation build reconstruction frame */ vp8_encode_frame(cpi); - if (cpi->oxcf.screen_content_mode == 2) { + if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { if (vp8_drop_encodedframe_overshoot(cpi, Q)) return; } @@ -3935,7 +4007,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0; #endif - vp8_clear_system_state(); + vpx_clear_system_state(); /* Test to see if the stats generated for this frame indicate that * we should have coded a key frame (assuming that we didn't)! @@ -3979,7 +4051,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif } - vp8_clear_system_state(); + vpx_clear_system_state(); if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; @@ -4003,9 +4075,9 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #if !CONFIG_REALTIME_ONLY top_index = cpi->active_worst_quality; #endif // !CONFIG_REALTIME_ONLY - /* If we have updated the active max Q do not call - * vp8_update_rate_correction_factors() this loop. - */ + /* If we have updated the active max Q do not call + * vp8_update_rate_correction_factors() this loop. 
+ */ active_worst_qchanged = 1; } else { active_worst_qchanged = 0; @@ -4204,6 +4276,20 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } } while (Loop == 1); +#if defined(DROP_UNCODED_FRAMES) + /* if there are no coded macroblocks at all drop this frame */ + if (cpi->common.MBs == cpi->mb.skip_true_count && + (cpi->drop_frame_count & 7) != 7 && cm->frame_type != KEY_FRAME) { + cpi->common.current_video_frame++; + cpi->frames_since_key++; + cpi->drop_frame_count++; + // We advance the temporal pattern for dropped frames. + cpi->temporal_pattern_counter++; + return; + } + cpi->drop_frame_count = 0; +#endif + #if 0 /* Experimental code for lagged and one pass * Update stats used for one pass GF selection @@ -4345,11 +4431,20 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } #endif +#ifdef OUTPUT_YUV_SKINMAP + if (cpi->common.current_video_frame > 1) { + vp8_compute_skin_map(cpi, yuv_skinmap_file); + } +#endif + #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { /* start loopfilter in separate thread */ sem_post(&cpi->h_event_start_lpf); cpi->b_lpf_running = 1; + /* wait for the filter_level to be picked so that we can continue with + * stream packing */ + sem_wait(&cpi->h_event_end_lpf); } else #endif { @@ -4359,7 +4454,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, update_reference_frames(cpi); #ifdef OUTPUT_YUV_DENOISED - vp8_write_yuv_frame(yuv_denoised_file, + vpx_write_yuv_frame(yuv_denoised_file, &cpi->denoiser.yv12_running_avg[INTRA_FRAME]); #endif @@ -4369,12 +4464,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } #endif -#if CONFIG_MULTITHREAD - /* wait that filter_level is picked so that we can continue with stream - * packing */ - if (cpi->b_multi_threaded) sem_wait(&cpi->h_event_end_lpf); -#endif - /* build the bitstream */ vp8_pack_bitstream(cpi, dest, dest_end, size); @@ -4549,7 +4638,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, { FILE *f = fopen("tmp.stt", "a"); - vp8_clear_system_state(); + vpx_clear_system_state(); if (cpi->twopass.total_left_stats.coded_error != 0.0) fprintf(f, "%10d %10d %10d %10d %10d %10"PRId64" %10"PRId64 @@ -4708,7 +4797,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif /* DEBUG */ - /* vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */ + /* vpx_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */ } #if !CONFIG_REALTIME_ONLY static void Pass2Encode(VP8_COMP *cpi, size_t *size, unsigned char *dest, @@ -4779,7 +4868,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, if (setjmp(cpi->common.error.jmp)) { cpi->common.error.setjmp = 0; - vp8_clear_system_state(); + vpx_clear_system_state(); return VPX_CODEC_CORRUPT_FRAME; } @@ -4986,7 +5075,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, *size = 0; /* Clear down mmx registers */ - vp8_clear_system_state(); + vpx_clear_system_state(); cm->frame_type = INTER_FRAME; cm->frame_flags = *frame_flags; @@ -5139,7 +5228,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, vp8_deblock(cm, cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0); - vp8_clear_system_state(); + vpx_clear_system_state(); ye = calc_plane_error(orig->y_buffer, orig->y_stride, pp->y_buffer, pp->y_stride, y_width, y_height); @@ -5216,7 +5305,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, #if 
CONFIG_MULTITHREAD /* wait for the lpf thread done */ - if (cpi->b_multi_threaded && cpi->b_lpf_running) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) { sem_wait(&cpi->h_event_end_lpf); cpi->b_lpf_running = 0; } @@ -5249,7 +5338,7 @@ int vp8_get_preview_raw_frame(VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, } #endif - vp8_clear_system_state(); + vpx_clear_system_state(); return ret; } } @@ -5262,9 +5351,6 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, const int range = 63; int i; - // This method is currently incompatible with the cyclic refresh method - if (cpi->cyclic_refresh_mode_enabled) return -1; - // Check number of rows and columns match if (cpi->common.mb_rows != (int)rows || cpi->common.mb_cols != (int)cols) { return -1; @@ -5283,7 +5369,11 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, return -1; } - if (!map) { + // Also disable segmentation if no deltas are specified. + if (!map || (delta_q[0] == 0 && delta_q[1] == 0 && delta_q[2] == 0 && + delta_q[3] == 0 && delta_lf[0] == 0 && delta_lf[1] == 0 && + delta_lf[2] == 0 && delta_lf[3] == 0 && threshold[0] == 0 && + threshold[1] == 0 && threshold[2] == 0 && threshold[3] == 0)) { disable_segmentation(cpi); return 0; } @@ -5320,6 +5410,11 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, /* Initialise the feature data structure */ set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); + if (threshold[0] != 0 || threshold[1] != 0 || threshold[2] != 0 || + threshold[3] != 0) + cpi->use_roi_static_threshold = 1; + cpi->cyclic_refresh_mode_enabled = 0; + return 0; } diff --git a/libs/libvpx/vp8/encoder/onyx_int.h b/libs/libvpx/vp8/encoder/onyx_int.h index 59ad5773a6..c489b46c2d 100644 --- a/libs/libvpx/vp8/encoder/onyx_int.h +++ b/libs/libvpx/vp8/encoder/onyx_int.h @@ -249,6 +249,10 @@ typedef struct { int filter_level; + int frames_since_last_drop_overshoot; + + int force_maxqp; + int last_frame_percent_intra; int count_mb_ref_frame_usage[MAX_REF_FRAMES]; @@ -413,6 +417,9 @@ typedef struct VP8_COMP { int drop_frames_allowed; /* Are we permitted to drop frames? */ int drop_frame; /* Drop this frame? */ +#if defined(DROP_UNCODED_FRAMES) + int drop_frame_count; +#endif vp8_prob frame_coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] [ENTROPY_NODES]; @@ -468,6 +475,8 @@ typedef struct VP8_COMP { int zeromv_count; int lf_zeromv_pct; + unsigned char *skin_map; + unsigned char *segmentation_map; signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; int segment_encode_breakout[MAX_MB_SEGMENTS]; @@ -500,12 +509,18 @@ typedef struct VP8_COMP { int mse_source_denoised; int force_maxqp; + int frames_since_last_drop_overshoot; + + // GF update for 1 pass cbr. + int gf_update_onepass_cbr; + int gf_interval_onepass_cbr; + int gf_noboost_onepass_cbr; #if CONFIG_MULTITHREAD /* multithread data */ - int *mt_current_mb_col; + vpx_atomic_int *mt_current_mb_col; int mt_sync_range; - int b_multi_threaded; + vpx_atomic_int b_multi_threaded; int encoding_thread_count; int b_lpf_running; @@ -677,6 +692,9 @@ typedef struct VP8_COMP { int token_costs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; } rd_costs; + + // Use the static threshold from ROI settings. 
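/* For reference, a hypothetical caller of the ROI interface changed above,
 * going through the public VP8E_SET_ROI_MAP control (the helper name and the
 * chosen deltas are illustrative).  With this patch, a map whose q/lf deltas
 * and static thresholds are all zero now disables segmentation, and any
 * non-zero static threshold sets use_roi_static_threshold and turns off
 * cyclic refresh. */
#include <stdlib.h>
#include <string.h>
#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

static int apply_roi_sketch(vpx_codec_ctx_t *enc, unsigned int mb_rows,
                            unsigned int mb_cols) {
  vpx_roi_map_t roi;
  int ret;
  memset(&roi, 0, sizeof(roi));
  roi.rows = mb_rows;
  roi.cols = mb_cols;
  roi.roi_map = (unsigned char *)calloc(mb_rows * mb_cols, 1); /* all segment 0 */
  if (!roi.roi_map) return -1;
  roi.delta_q[1] = -10; /* better quality for blocks mapped to segment 1 */
  ret = vpx_codec_control(enc, VP8E_SET_ROI_MAP, &roi) ? -1 : 0;
  free(roi.roi_map);
  return ret;
}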
+ int use_roi_static_threshold; } VP8_COMP; void vp8_initialize_enc(void); diff --git a/libs/libvpx/vp8/encoder/pickinter.c b/libs/libvpx/vp8/encoder/pickinter.c index 7b68d35f50..a9943eb6ab 100644 --- a/libs/libvpx/vp8/encoder/pickinter.c +++ b/libs/libvpx/vp8/encoder/pickinter.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include "vpx_config.h" #include "./vpx_dsp_rtcd.h" @@ -24,6 +25,7 @@ #include "vp8/common/reconintra4x4.h" #include "vpx_dsp/variance.h" #include "mcomp.h" +#include "vp8/common/vp8_skin_detection.h" #include "rdopt.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" @@ -35,82 +37,9 @@ extern unsigned int cnt_pm; #endif -#define MODEL_MODE 1 - extern const int vp8_ref_frame_order[MAX_MODES]; extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES]; -// Fixed point implementation of a skin color classifier. Skin color -// is model by a Gaussian distribution in the CbCr color space. -// See ../../test/skin_color_detector_test.cc where the reference -// skin color classifier is defined. - -// Fixed-point skin color model parameters. -static const int skin_mean[5][2] = { { 7463, 9614 }, - { 6400, 10240 }, - { 7040, 10240 }, - { 8320, 9280 }, - { 6800, 9614 } }; -static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16 -static const int skin_threshold[6] = { 1570636, 1400000, 800000, - 800000, 800000, 800000 }; // q18 - -// Evaluates the Mahalanobis distance measure for the input CbCr values. -static int evaluate_skin_color_difference(int cb, int cr, int idx) { - const int cb_q6 = cb << 6; - const int cr_q6 = cr << 6; - const int cb_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]); - const int cbcr_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]); - const int cr_diff_q12 = - (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]); - const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10; - const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10; - const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10; - const int skin_diff = - skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 + - skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2; - return skin_diff; -} - -// Checks if the input yCbCr values corresponds to skin color. -static int is_skin_color(int y, int cb, int cr, int consec_zeromv) { - if (y < 40 || y > 220) { - return 0; - } else { - if (MODEL_MODE == 0) { - return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]); - } else { - int i = 0; - // No skin if block has been zero motion for long consecutive time. - if (consec_zeromv > 60) return 0; - // Exit on grey. - if (cb == 128 && cr == 128) return 0; - // Exit on very strong cb. - if (cb > 150 && cr < 110) return 0; - for (; i < 5; ++i) { - int skin_color_diff = evaluate_skin_color_difference(cb, cr, i); - if (skin_color_diff < skin_threshold[i + 1]) { - if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) { - return 0; - } else if (consec_zeromv > 25 && - skin_color_diff > (skin_threshold[i + 1] >> 1)) { - return 0; - } else { - return 1; - } - } - // Exit if difference is much large than the threshold. 
- if (skin_color_diff > (skin_threshold[i + 1] << 3)) { - return 0; - } - } - return 0; - } - } -} - static int macroblock_corner_grad(unsigned char *signal, int stride, int offsetx, int offsety, int sgnx, int sgny) { @@ -206,8 +135,8 @@ int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, (void)mvcost; (void)distortion; (void)sse; - bestmv->as_mv.row <<= 3; - bestmv->as_mv.col <<= 3; + bestmv->as_mv.row *= 8; + bestmv->as_mv.col *= 8; return 0; } @@ -299,8 +228,8 @@ static int pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *best_dist) { MODE_INFO *const mic = xd->mode_info_context; const int mis = xd->mode_info_stride; - B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); - int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(d); + B_PREDICTION_MODE best_mode = B_MODE_COUNT; + int r = 0, d = 0; if (mb->e_mbd.frame_type == KEY_FRAME) { const B_PREDICTION_MODE A = above_block_mode(mic, i, mis); @@ -313,6 +242,7 @@ static int pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *best_dist) { cost += r; distortion += d; + assert(best_mode != B_MODE_COUNT); mic->bmi[i].as_mode = best_mode; /* Break out case where we have already exceeded best so far value @@ -353,7 +283,7 @@ static void pick_intra_mbuv_mode(MACROBLOCK *mb) { int Vaverage = 0; int diff; int pred_error[4] = { 0, 0, 0, 0 }, best_error = INT_MAX; - MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); + MB_PREDICTION_MODE best_mode = MB_MODE_COUNT; for (i = 0; i < 8; ++i) { uleft_col[i] = x->dst.u_buffer[i * x->dst.uv_stride - 1]; @@ -442,6 +372,7 @@ static void pick_intra_mbuv_mode(MACROBLOCK *mb) { } } + assert(best_mode != MB_MODE_COUNT); mb->e_mbd.mode_info_context->mbmi.uv_mode = best_mode; } @@ -757,27 +688,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, #endif // Check if current macroblock is in skin area. 
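/* The per-macroblock sampling removed just below is replaced by a lookup into
 * cpi->skin_map[], which compute_skin_map() (added earlier in this patch)
 * fills once per frame via the new shared vp8_compute_skin_block() helper.
 * Sketch of what that helper is assumed to do for a 16x16 block -- its body is
 * not part of this hunk, so names and details are illustrative: average the
 * four centre samples of each plane and run the fixed-point skin-colour
 * classifier that this file used to carry (removed above). */
static int compute_skin_block_sketch(const unsigned char *y,
                                     const unsigned char *u,
                                     const unsigned char *v, int y_stride,
                                     int uv_stride, int consec_zeromv) {
  const int y_avg = (y[7 * y_stride + 7] + y[7 * y_stride + 8] +
                     y[8 * y_stride + 7] + y[8 * y_stride + 8]) >> 2;
  const int u_avg = (u[3 * uv_stride + 3] + u[3 * uv_stride + 4] +
                     u[4 * uv_stride + 3] + u[4 * uv_stride + 4]) >> 2;
  const int v_avg = (v[3 * uv_stride + 3] + v[3 * uv_stride + 4] +
                     v[4 * uv_stride + 3] + v[4 * uv_stride + 4]) >> 2;
  /* is_skin_color(): the classifier shown in the deletion from this file. */
  return is_skin_color(y_avg, u_avg, v_avg, consec_zeromv);
}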
- { - const int y = (x->src.y_buffer[7 * x->src.y_stride + 7] + - x->src.y_buffer[7 * x->src.y_stride + 8] + - x->src.y_buffer[8 * x->src.y_stride + 7] + - x->src.y_buffer[8 * x->src.y_stride + 8]) >> - 2; - const int cb = (x->src.u_buffer[3 * x->src.uv_stride + 3] + - x->src.u_buffer[3 * x->src.uv_stride + 4] + - x->src.u_buffer[4 * x->src.uv_stride + 3] + - x->src.u_buffer[4 * x->src.uv_stride + 4]) >> - 2; - const int cr = (x->src.v_buffer[3 * x->src.uv_stride + 3] + - x->src.v_buffer[3 * x->src.uv_stride + 4] + - x->src.v_buffer[4 * x->src.uv_stride + 3] + - x->src.v_buffer[4 * x->src.uv_stride + 4]) >> - 2; - x->is_skin = 0; - if (!cpi->oxcf.screen_content_mode) { - int block_index = mb_row * cpi->common.mb_cols + mb_col; - x->is_skin = is_skin_color(y, cb, cr, cpi->consec_zero_last[block_index]); - } + x->is_skin = 0; + if (!cpi->oxcf.screen_content_mode) { + int block_index = mb_row * cpi->common.mb_cols + mb_col; + x->is_skin = cpi->skin_map[block_index]; } #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity) { diff --git a/libs/libvpx/vp8/encoder/picklpf.c b/libs/libvpx/vp8/encoder/picklpf.c index 6f287322ec..b1b712db9a 100644 --- a/libs/libvpx/vp8/encoder/picklpf.c +++ b/libs/libvpx/vp8/encoder/picklpf.c @@ -12,6 +12,7 @@ #include "./vpx_scale_rtcd.h" #include "vp8/common/onyxc_int.h" #include "onyx_int.h" +#include "vp8/encoder/picklpf.h" #include "vp8/encoder/quantize.h" #include "vpx_mem/vpx_mem.h" #include "vpx_scale/vpx_scale.h" diff --git a/libs/libvpx/vp8/encoder/picklpf.h b/libs/libvpx/vp8/encoder/picklpf.h new file mode 100644 index 0000000000..e6ad0dbf26 --- /dev/null +++ b/libs/libvpx/vp8/encoder/picklpf.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_ENCODER_PICKLPF_H_ +#define VP8_ENCODER_PICKLPF_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; +struct yv12_buffer_config; + +void vp8cx_pick_filter_level_fast(struct yv12_buffer_config *sd, + struct VP8_COMP *cpi); +void vp8cx_set_alt_lf_level(struct VP8_COMP *cpi, int filt_val); +void vp8cx_pick_filter_level(struct yv12_buffer_config *sd, VP8_COMP *cpi); + +#ifdef __cplusplus +} +#endif + +#endif // VP8_ENCODER_PICKLPF_H_ diff --git a/libs/libvpx/vp8/encoder/ratectrl.c b/libs/libvpx/vp8/encoder/ratectrl.c index 649f696dc7..e58c310980 100644 --- a/libs/libvpx/vp8/encoder/ratectrl.c +++ b/libs/libvpx/vp8/encoder/ratectrl.c @@ -22,6 +22,7 @@ #include "vp8/common/systemdependent.h" #include "encodemv.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/system_state.h" #define MIN_BPB_FACTOR 0.01 #define MAX_BPB_FACTOR 50 @@ -296,7 +297,7 @@ static void calc_iframe_target_size(VP8_COMP *cpi) { uint64_t target; /* Clear down mmx registers to allow floating point in what follows */ - vp8_clear_system_state(); + vpx_clear_system_state(); if (cpi->oxcf.fixed_q >= 0) { int Q = cpi->oxcf.key_q; @@ -497,11 +498,9 @@ static void calc_gf_params(VP8_COMP *cpi) { * This is updated once the real frame size/boost is known. 
*/ if (cpi->oxcf.fixed_q == -1) { - if (cpi->pass == 2) /* 2 Pass */ - { + if (cpi->pass == 2) { /* 2 Pass */ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; - } else /* 1 Pass */ - { + } else { /* 1 Pass */ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; if (cpi->last_boost > 750) cpi->frames_till_gf_update_due++; @@ -884,61 +883,61 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { /* Adjust target frame size for Golden Frames: */ if (cpi->oxcf.error_resilient_mode == 0 && (cpi->frames_till_gf_update_due == 0) && !cpi->drop_frame) { - int Q = - (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; + if (!cpi->gf_update_onepass_cbr) { + int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] + : cpi->oxcf.fixed_q; - int gf_frame_useage = 0; /* Golden frame useage since last GF */ - int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + - cpi->recent_ref_frame_usage[LAST_FRAME] + - cpi->recent_ref_frame_usage[GOLDEN_FRAME] + - cpi->recent_ref_frame_usage[ALTREF_FRAME]; + int gf_frame_useage = 0; /* Golden frame useage since last GF */ + int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + + cpi->recent_ref_frame_usage[LAST_FRAME] + + cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]; - int pct_gf_active = (100 * cpi->gf_active_count) / - (cpi->common.mb_rows * cpi->common.mb_cols); + int pct_gf_active = (100 * cpi->gf_active_count) / + (cpi->common.mb_rows * cpi->common.mb_cols); - if (tot_mbs) { - gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + - cpi->recent_ref_frame_usage[ALTREF_FRAME]) * - 100 / tot_mbs; - } - - if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active; - - /* Is a fixed manual GF frequency being used */ - if (cpi->auto_gold) { - /* For one pass throw a GF if recent frame intra useage is - * low or the GF useage is high - */ - if ((cpi->pass == 0) && - (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5)) { - cpi->common.refresh_golden_frame = 1; - - /* Two pass GF descision */ - } else if (cpi->pass == 2) { - cpi->common.refresh_golden_frame = 1; + if (tot_mbs) { + gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * + 100 / tot_mbs; + } + + if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active; + + /* Is a fixed manual GF frequency being used */ + if (cpi->auto_gold) { + /* For one pass throw a GF if recent frame intra useage is + * low or the GF useage is high + */ + if ((cpi->pass == 0) && + (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5)) { + cpi->common.refresh_golden_frame = 1; + + /* Two pass GF descision */ + } else if (cpi->pass == 2) { + cpi->common.refresh_golden_frame = 1; + } } - } #if 0 - /* Debug stats */ - if (0) - { - FILE *f; + /* Debug stats */ + if (0) { + FILE *f; - f = fopen("gf_useaget.stt", "a"); - fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n", - cpi->common.current_video_frame, cpi->gfu_boost, GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage); - fclose(f); - } + f = fopen("gf_useaget.stt", "a"); + fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n", + cpi->common.current_video_frame, cpi->gfu_boost, + GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage); + fclose(f); + } #endif - if (cpi->common.refresh_golden_frame == 1) { + if (cpi->common.refresh_golden_frame == 1) { #if 0 - if (0) - { + if (0) { FILE *f; f = fopen("GFexit.stt", "a"); @@ -948,61 +947,76 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { #endif - if 
(cpi->auto_adjust_gold_quantizer) { - calc_gf_params(cpi); - } - - /* If we are using alternate ref instead of gf then do not apply the - * boost It will instead be applied to the altref update Jims - * modified boost - */ - if (!cpi->source_alt_ref_active) { - if (cpi->oxcf.fixed_q < 0) { - if (cpi->pass == 2) { - /* The spend on the GF is defined in the two pass - * code for two pass encodes - */ - cpi->this_frame_target = cpi->per_frame_bandwidth; - } else { - int Boost = cpi->last_boost; - int frames_in_section = cpi->frames_till_gf_update_due + 1; - int allocation_chunks = (frames_in_section * 100) + (Boost - 100); - int bits_in_section = cpi->inter_frame_target * frames_in_section; - - /* Normalize Altboost and allocations chunck down to - * prevent overflow - */ - while (Boost > 1000) { - Boost /= 2; - allocation_chunks /= 2; - } - - /* Avoid loss of precision but avoid overflow */ - if ((bits_in_section >> 7) > allocation_chunks) { - cpi->this_frame_target = - Boost * (bits_in_section / allocation_chunks); - } else { - cpi->this_frame_target = - (Boost * bits_in_section) / allocation_chunks; - } - } - } else { - cpi->this_frame_target = - (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0) * - cpi->last_boost) / - 100; + if (cpi->auto_adjust_gold_quantizer) { + calc_gf_params(cpi); } - } - /* If there is an active ARF at this location use the minimum - * bits on this frame even if it is a contructed arf. - * The active maximum quantizer insures that an appropriate - * number of bits will be spent if needed for contstructed ARFs. - */ - else { - cpi->this_frame_target = 0; - } + /* If we are using alternate ref instead of gf then do not apply the + * boost It will instead be applied to the altref update Jims + * modified boost + */ + if (!cpi->source_alt_ref_active) { + if (cpi->oxcf.fixed_q < 0) { + if (cpi->pass == 2) { + /* The spend on the GF is defined in the two pass + * code for two pass encodes + */ + cpi->this_frame_target = cpi->per_frame_bandwidth; + } else { + int Boost = cpi->last_boost; + int frames_in_section = cpi->frames_till_gf_update_due + 1; + int allocation_chunks = (frames_in_section * 100) + (Boost - 100); + int bits_in_section = cpi->inter_frame_target * frames_in_section; + /* Normalize Altboost and allocations chunck down to + * prevent overflow + */ + while (Boost > 1000) { + Boost /= 2; + allocation_chunks /= 2; + } + + /* Avoid loss of precision but avoid overflow */ + if ((bits_in_section >> 7) > allocation_chunks) { + cpi->this_frame_target = + Boost * (bits_in_section / allocation_chunks); + } else { + cpi->this_frame_target = + (Boost * bits_in_section) / allocation_chunks; + } + } + } else { + cpi->this_frame_target = + (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0) * + cpi->last_boost) / + 100; + } + } else { + /* If there is an active ARF at this location use the minimum + * bits on this frame even if it is a contructed arf. + * The active maximum quantizer insures that an appropriate + * number of bits will be spent if needed for contstructed ARFs. + */ + cpi->this_frame_target = 0; + } + + cpi->current_gf_interval = cpi->frames_till_gf_update_due; + } + } else { + // Special case for 1 pass CBR: fixed gf period. + // TODO(marpan): Adjust this boost/interval logic. + // If gf_cbr_boost_pct is small (below threshold) set the flag + // gf_noboost_onepass_cbr = 1, which forces the gf to use the same + // rate correction factor as last. 
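/* Worked example for the fixed-interval golden-frame update below (numbers
 * are illustrative): with gf_cbr_boost_pct = 150 and an inter-frame target of
 * 20000 bits, the golden frame is budgeted 20000 * (100 + 150) / 100 = 50000
 * bits.  The update is applied only when more than half of the macroblocks
 * had zero motion, and with gf_cbr_boost_pct <= 100 the gf_noboost_onepass_cbr
 * flag keeps the golden frame on the ordinary inter rate-correction factor. */
static int boosted_gf_target_sketch(int this_frame_target,
                                    unsigned int gf_cbr_boost_pct) {
  return (int)((this_frame_target * (100 + gf_cbr_boost_pct)) / 100);
}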
+ cpi->gf_noboost_onepass_cbr = (cpi->oxcf.gf_cbr_boost_pct <= 100); + cpi->baseline_gf_interval = cpi->gf_interval_onepass_cbr; + // Skip this update if the zero_mvcount is low. + if (cpi->zeromv_count > (cpi->common.MBs >> 1)) { + cpi->common.refresh_golden_frame = 1; + cpi->this_frame_target = + (cpi->this_frame_target * (100 + cpi->oxcf.gf_cbr_boost_pct)) / 100; + } + cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; cpi->current_gf_interval = cpi->frames_till_gf_update_due; } } @@ -1019,13 +1033,14 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) { int projected_size_based_on_q = 0; /* Clear down mmx registers to allow floating point in what follows */ - vp8_clear_system_state(); + vpx_clear_system_state(); if (cpi->common.frame_type == KEY_FRAME) { rate_correction_factor = cpi->key_frame_rate_correction_factor; } else { - if (cpi->oxcf.number_of_layers == 1 && (cpi->common.refresh_alt_ref_frame || - cpi->common.refresh_golden_frame)) { + if (cpi->oxcf.number_of_layers == 1 && !cpi->gf_noboost_onepass_cbr && + (cpi->common.refresh_alt_ref_frame || + cpi->common.refresh_golden_frame)) { rate_correction_factor = cpi->gf_rate_correction_factor; } else { rate_correction_factor = cpi->rate_correction_factor; @@ -1101,8 +1116,9 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) { if (cpi->common.frame_type == KEY_FRAME) { cpi->key_frame_rate_correction_factor = rate_correction_factor; } else { - if (cpi->oxcf.number_of_layers == 1 && (cpi->common.refresh_alt_ref_frame || - cpi->common.refresh_golden_frame)) { + if (cpi->oxcf.number_of_layers == 1 && !cpi->gf_noboost_onepass_cbr && + (cpi->common.refresh_alt_ref_frame || + cpi->common.refresh_golden_frame)) { cpi->gf_rate_correction_factor = rate_correction_factor; } else { cpi->rate_correction_factor = rate_correction_factor; @@ -1117,7 +1133,6 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { cpi->active_worst_quality = cpi->worst_quality; return cpi->worst_quality; } - /* Reset Zbin OQ value */ cpi->mb.zbin_over_quant = 0; @@ -1127,10 +1142,12 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { if (cpi->common.frame_type == KEY_FRAME) { Q = cpi->oxcf.key_q; } else if (cpi->oxcf.number_of_layers == 1 && - cpi->common.refresh_alt_ref_frame) { + cpi->common.refresh_alt_ref_frame && + !cpi->gf_noboost_onepass_cbr) { Q = cpi->oxcf.alt_q; } else if (cpi->oxcf.number_of_layers == 1 && - cpi->common.refresh_golden_frame) { + cpi->common.refresh_golden_frame && + !cpi->gf_noboost_onepass_cbr) { Q = cpi->oxcf.gold_q; } } else { @@ -1144,7 +1161,7 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { if (cpi->common.frame_type == KEY_FRAME) { correction_factor = cpi->key_frame_rate_correction_factor; } else { - if (cpi->oxcf.number_of_layers == 1 && + if (cpi->oxcf.number_of_layers == 1 && !cpi->gf_noboost_onepass_cbr && (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)) { correction_factor = cpi->gf_rate_correction_factor; @@ -1198,6 +1215,7 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { if (cpi->common.frame_type == KEY_FRAME) { zbin_oqmax = 0; } else if (cpi->oxcf.number_of_layers == 1 && + !cpi->gf_noboost_onepass_cbr && (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))) { @@ -1302,7 +1320,7 @@ static int estimate_keyframe_frequency(VP8_COMP *cpi) { void vp8_adjust_key_frame_context(VP8_COMP *cpi) { /* Clear down mmx registers to allow floating point in 
what follows */ - vp8_clear_system_state(); + vpx_clear_system_state(); /* Do we have any key frame overspend to recover? */ /* Two-pass overspend handled elsewhere. */ @@ -1422,12 +1440,33 @@ int vp8_pick_frame_size(VP8_COMP *cpi) { // If this just encoded frame (mcomp/transform/quant, but before loopfilter and // pack_bitstream) has large overshoot, and was not being encoded close to the // max QP, then drop this frame and force next frame to be encoded at max QP. -// Condition this on 1 pass CBR with screen content mode and frame dropper off. +// Allow this for screen_content_mode = 2, or if drop frames is allowed. // TODO(marpan): Should do this exit condition during the encode_frame // (i.e., halfway during the encoding of the frame) to save cycles. int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { - if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && - cpi->drop_frames_allowed == 0 && cpi->common.frame_type != KEY_FRAME) { + int force_drop_overshoot = 0; +#if CONFIG_MULTI_RES_ENCODING + // Only check for dropping due to overshoot on the lowest stream. + // If the lowest stream of the multi-res encoding was dropped due to + // overshoot, then force dropping on all upper layer streams + // (mr_encoder_id > 0). + LOWER_RES_FRAME_INFO *low_res_frame_info = + (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info; + if (cpi->oxcf.mr_total_resolutions > 1 && cpi->oxcf.mr_encoder_id > 0) { + force_drop_overshoot = low_res_frame_info->is_frame_dropped_overshoot_maxqp; + if (!force_drop_overshoot) { + cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot++; + return 0; + } + } +#endif + if (cpi->common.frame_type != KEY_FRAME && + (cpi->oxcf.screen_content_mode == 2 || + (cpi->drop_frames_allowed && + (force_drop_overshoot || + (cpi->rate_correction_factor < (4.0f * MIN_BPB_FACTOR) && + cpi->frames_since_last_drop_overshoot > (int)cpi->framerate))))) { // Note: the "projected_frame_size" from encode_frame() only gives estimate // of mode/motion vector rate (in non-rd mode): so below we only require // that projected_frame_size is somewhat greater than per-frame-bandwidth, @@ -1438,17 +1477,20 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { // Rate threshold, in bytes. int thresh_rate = 2 * (cpi->av_per_frame_bandwidth >> 3); // Threshold for the average (over all macroblocks) of the pixel-sum - // residual error over 16x16 block. Should add QP dependence on threshold? - int thresh_pred_err_mb = (256 << 4); + // residual error over 16x16 block. + int thresh_pred_err_mb = (200 << 4); int pred_err_mb = (int)(cpi->mb.prediction_error / cpi->common.MBs); - if (Q < thresh_qp && cpi->projected_frame_size > thresh_rate && - pred_err_mb > thresh_pred_err_mb) { + // Reduce/ignore thresh_rate if pred_err_mb much larger than its threshold, + // give more weight to pred_err metric for overshoot detection. + if (cpi->drop_frames_allowed && pred_err_mb > (thresh_pred_err_mb << 4)) + thresh_rate = thresh_rate >> 3; + if ((Q < thresh_qp && cpi->projected_frame_size > thresh_rate && + pred_err_mb > thresh_pred_err_mb) || + force_drop_overshoot) { + unsigned int i; double new_correction_factor; - const int target_size = cpi->av_per_frame_bandwidth; int target_bits_per_mb; - // Drop this frame: advance frame counters, and set force_maxqp flag. - cpi->common.current_video_frame++; - cpi->frames_since_key++; + const int target_size = cpi->av_per_frame_bandwidth; // Flag to indicate we will force next frame to be encoded at max QP. 
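/* Rough numbers for the overshoot test above, assuming a 1 Mbps, 30 fps
 * stream (illustrative only): av_per_frame_bandwidth is about 33333 bits, so
 * thresh_rate = 2 * (33333 >> 3) = 8332 bytes -- roughly twice the nominal
 * per-frame byte budget -- and thresh_pred_err_mb = 200 << 4 = 3200, i.e. an
 * average residual of about 12.5 per pixel of a 16x16 block.  Condensed form
 * of the size/error part of the check, before the extra thresh_rate
 * relaxation added above (helper name is illustrative): */
static int overshoot_detected_sketch(int Q, int thresh_qp,
                                     int projected_frame_size,
                                     int av_per_frame_bandwidth,
                                     int pred_err_mb) {
  const int thresh_rate = 2 * (av_per_frame_bandwidth >> 3); /* bytes */
  const int thresh_pred_err_mb = 200 << 4;
  return Q < thresh_qp && projected_frame_size > thresh_rate &&
         pred_err_mb > thresh_pred_err_mb;
}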
cpi->force_maxqp = 1; // Reset the buffer levels. @@ -1479,14 +1521,40 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { if (cpi->rate_correction_factor > MAX_BPB_FACTOR) { cpi->rate_correction_factor = MAX_BPB_FACTOR; } + // Drop this frame: update frame counters. + cpi->common.current_video_frame++; + cpi->frames_since_key++; + cpi->temporal_pattern_counter++; + cpi->frames_since_last_drop_overshoot = 0; + if (cpi->oxcf.number_of_layers > 1) { + // Set max_qp and rate correction for all temporal layers if overshoot + // is detected. + for (i = 0; i < cpi->oxcf.number_of_layers; ++i) { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + lc->force_maxqp = 1; + lc->frames_since_last_drop_overshoot = 0; + lc->rate_correction_factor = cpi->rate_correction_factor; + } + } +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) + low_res_frame_info->is_frame_dropped_overshoot_maxqp = 1; +#endif return 1; - } else { - cpi->force_maxqp = 0; - return 0; } cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot++; +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) + low_res_frame_info->is_frame_dropped_overshoot_maxqp = 0; +#endif return 0; } cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot++; +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) + low_res_frame_info->is_frame_dropped_overshoot_maxqp = 0; +#endif return 0; } diff --git a/libs/libvpx/vp8/encoder/rdopt.c b/libs/libvpx/vp8/encoder/rdopt.c index 9ba301e080..e210b44105 100644 --- a/libs/libvpx/vp8/encoder/rdopt.c +++ b/libs/libvpx/vp8/encoder/rdopt.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include #include @@ -15,12 +16,14 @@ #include "vpx_config.h" #include "vp8_rtcd.h" #include "./vpx_dsp_rtcd.h" +#include "encodeframe.h" #include "tokenize.h" #include "treewriter.h" #include "onyx_int.h" #include "modecosts.h" #include "encodeintra.h" #include "pickinter.h" +#include "vp8/common/common.h" #include "vp8/common/entropymode.h" #include "vp8/common/reconinter.h" #include "vp8/common/reconintra.h" @@ -30,6 +33,7 @@ #include "encodemb.h" #include "vp8/encoder/quantize.h" #include "vpx_dsp/variance.h" +#include "vpx_ports/system_state.h" #include "mcomp.h" #include "rdopt.h" #include "vpx_mem/vpx_mem.h" @@ -105,11 +109,10 @@ const int vp8_ref_frame_order[MAX_MODES] = { 0, }; -static void fill_token_costs(int c[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS], - const vp8_prob p[BLOCK_TYPES][COEF_BANDS] - [PREV_COEF_CONTEXTS] - [ENTROPY_NODES]) { +static void fill_token_costs( + int c[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS], + const vp8_prob p[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] + [ENTROPY_NODES]) { int i, j, k; for (i = 0; i < BLOCK_TYPES; ++i) { @@ -163,7 +166,7 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue) { double capped_q = (Qvalue < 160) ? (double)Qvalue : 160.0; double rdconst = 2.80; - vp8_clear_system_state(); + vpx_clear_system_state(); /* Further tests required to see if optimum is different * for key frames, golden frames and arf frames. 
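The hunk above reformats vp8_initialize_rd_consts(), which seeds the
rate-distortion multipliers from the (capped) quantizer.  For orientation,
mode decisions in this encoder minimise a Lagrangian cost of the form
distortion + lambda * rate; the RDCOST macro in vp8/encoder/rdopt.h (also
touched later in this patch) evaluates it with the rate term weighted by a
Q-dependent multiplier and scaled back by 1/256.  A minimal sketch of that
evaluation (parameter names are illustrative):

/* cost = distortion * DM + (rate * RM + 128) >> 8, mirroring RDCOST(). */
static int rd_cost_sketch(int rate, int distortion, int rate_mult,
                          int dist_mult) {
  return ((128 + rate * rate_mult) >> 8) + dist_mult * distortion;
}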
@@ -607,9 +610,8 @@ static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *rate_y, for (i = 0; i < 16; ++i) { MODE_INFO *const mic = xd->mode_info_context; const int mis = xd->mode_info_stride; - B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); - int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), - UNINITIALIZED_IS_SAFE(d); + B_PREDICTION_MODE best_mode = B_MODE_COUNT; + int r = 0, ry = 0, d = 0; if (mb->e_mbd.frame_type == KEY_FRAME) { const B_PREDICTION_MODE A = above_block_mode(mic, i, mis); @@ -626,6 +628,7 @@ static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *rate_y, distortion += d; tot_rate_y += ry; + assert(best_mode != B_MODE_COUNT); mic->bmi[i].as_mode = best_mode; if (total_rd >= (int64_t)best_rd) break; @@ -643,7 +646,7 @@ static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *rate_y, static int rd_pick_intra16x16mby_mode(MACROBLOCK *x, int *Rate, int *rate_y, int *Distortion) { MB_PREDICTION_MODE mode; - MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); + MB_PREDICTION_MODE mode_selected = MB_MODE_COUNT; int rate, ratey; int distortion; int best_rd = INT_MAX; @@ -673,6 +676,7 @@ static int rd_pick_intra16x16mby_mode(MACROBLOCK *x, int *Rate, int *rate_y, } } + assert(mode_selected != MB_MODE_COUNT); xd->mode_info_context->mbmi.mode = mode_selected; return best_rd; } @@ -740,9 +744,9 @@ static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate, int *rate_tokenonly, int *distortion) { MB_PREDICTION_MODE mode; - MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); + MB_PREDICTION_MODE mode_selected = MB_MODE_COUNT; int best_rd = INT_MAX; - int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r); + int d = 0, r = 0; int rate_to; MACROBLOCKD *xd = &x->e_mbd; @@ -786,6 +790,7 @@ static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate, *rate = r; *distortion = d; + assert(mode_selected != MB_MODE_COUNT); xd->mode_info_context->mbmi.uv_mode = mode_selected; } @@ -849,8 +854,7 @@ static int labels2mode(MACROBLOCK *x, int const *labelings, int which_label, default: break; } - if (m == ABOVE4X4) /* replace above with left if same */ - { + if (m == ABOVE4X4) { /* replace above with left if same */ int_mv left_mv; left_mv.as_int = col ? 
d[-1].bmi.mv.as_int : left_block_mv(mic, i); @@ -956,19 +960,13 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, vp8_variance_fn_ptr_t *v_fn_ptr; ENTROPY_CONTEXT_PLANES t_above, t_left; - ENTROPY_CONTEXT *ta; - ENTROPY_CONTEXT *tl; ENTROPY_CONTEXT_PLANES t_above_b, t_left_b; - ENTROPY_CONTEXT *ta_b; - ENTROPY_CONTEXT *tl_b; memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); - ta = (ENTROPY_CONTEXT *)&t_above; - tl = (ENTROPY_CONTEXT *)&t_left; - ta_b = (ENTROPY_CONTEXT *)&t_above_b; - tl_b = (ENTROPY_CONTEXT *)&t_left_b; + vp8_zero(t_above_b); + vp8_zero(t_left_b); br = 0; bd = 0; @@ -1148,13 +1146,13 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, mode_selected = this_mode; best_label_rd = this_rd; - memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES)); - memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_above_b, &t_above_s, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_left_b, &t_left_s, sizeof(ENTROPY_CONTEXT_PLANES)); } } /*for each 4x4 mode*/ - memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES)); - memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_above, &t_above_b, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_left, &t_left_b, sizeof(ENTROPY_CONTEXT_PLANES)); labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected], bsi->ref_mv, x->mvcost); diff --git a/libs/libvpx/vp8/encoder/rdopt.h b/libs/libvpx/vp8/encoder/rdopt.h index 8186ff1051..960bd8f1cd 100644 --- a/libs/libvpx/vp8/encoder/rdopt.h +++ b/libs/libvpx/vp8/encoder/rdopt.h @@ -19,6 +19,9 @@ extern "C" { #define RDCOST(RM, DM, R, D) (((128 + (R) * (RM)) >> 8) + (DM) * (D)) +void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex); +void vp8_auto_select_speed(VP8_COMP *cpi); + static INLINE void insertsortmv(int arr[], int len) { int i, j, k; diff --git a/libs/libvpx/vp8/encoder/temporal_filter.c b/libs/libvpx/vp8/encoder/temporal_filter.c index 1b2f46bb69..0a7d25fb0a 100644 --- a/libs/libvpx/vp8/encoder/temporal_filter.c +++ b/libs/libvpx/vp8/encoder/temporal_filter.c @@ -20,6 +20,7 @@ #include "ratectrl.h" #include "vp8/common/quant_common.h" #include "segmentation.h" +#include "temporal_filter.h" #include "vpx_mem/vpx_mem.h" #include "vp8/common/swapyv12buffer.h" #include "vp8/common/threading.h" diff --git a/libs/libvpx/vp8/encoder/temporal_filter.h b/libs/libvpx/vp8/encoder/temporal_filter.h new file mode 100644 index 0000000000..865d909fb6 --- /dev/null +++ b/libs/libvpx/vp8/encoder/temporal_filter.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP8_ENCODER_TEMPORAL_FILTER_H_ +#define VP8_ENCODER_TEMPORAL_FILTER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; + +void vp8_temporal_filter_prepare_c(struct VP8_COMP *cpi, int distance); + +#ifdef __cplusplus +} +#endif + +#endif // VP8_ENCODER_TEMPORAL_FILTER_H_ diff --git a/libs/libvpx/vp8/encoder/x86/dct_mmx.asm b/libs/libvpx/vp8/encoder/x86/dct_mmx.asm deleted file mode 100644 index 6f188cb94a..0000000000 --- a/libs/libvpx/vp8/encoder/x86/dct_mmx.asm +++ /dev/null @@ -1,241 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch) -global sym(vp8_short_fdct4x4_mmx) PRIVATE -sym(vp8_short_fdct4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ; input - mov rdi, arg(1) ; output - - movsxd rax, dword ptr arg(2) ;pitch - - lea rcx, [rsi + rax*2] - ; read the input data - movq mm0, [rsi] - movq mm1, [rsi + rax] - - movq mm2, [rcx] - movq mm4, [rcx + rax] - - ; transpose for the first stage - movq mm3, mm0 ; 00 01 02 03 - movq mm5, mm2 ; 20 21 22 23 - - punpcklwd mm0, mm1 ; 00 10 01 11 - punpckhwd mm3, mm1 ; 02 12 03 13 - - punpcklwd mm2, mm4 ; 20 30 21 31 - punpckhwd mm5, mm4 ; 22 32 23 33 - - movq mm1, mm0 ; 00 10 01 11 - punpckldq mm0, mm2 ; 00 10 20 30 - - punpckhdq mm1, mm2 ; 01 11 21 31 - - movq mm2, mm3 ; 02 12 03 13 - punpckldq mm2, mm5 ; 02 12 22 32 - - punpckhdq mm3, mm5 ; 03 13 23 33 - - ; mm0 0 - ; mm1 1 - ; mm2 2 - ; mm3 3 - - ; first stage - movq mm5, mm0 - movq mm4, mm1 - - paddw mm0, mm3 ; a1 = 0 + 3 - paddw mm1, mm2 ; b1 = 1 + 2 - - psubw mm4, mm2 ; c1 = 1 - 2 - psubw mm5, mm3 ; d1 = 0 - 3 - - psllw mm5, 3 - psllw mm4, 3 - - psllw mm0, 3 - psllw mm1, 3 - - ; output 0 and 2 - movq mm2, mm0 ; a1 - - paddw mm0, mm1 ; op[0] = a1 + b1 - psubw mm2, mm1 ; op[2] = a1 - b1 - - ; output 1 and 3 - ; interleave c1, d1 - movq mm1, mm5 ; d1 - punpcklwd mm1, mm4 ; c1 d1 - punpckhwd mm5, mm4 ; c1 d1 - - movq mm3, mm1 - movq mm4, mm5 - - pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - - pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - - paddd mm1, MMWORD PTR[GLOBAL(_14500)] - paddd mm4, MMWORD PTR[GLOBAL(_14500)] - paddd mm3, MMWORD PTR[GLOBAL(_7500)] - paddd mm5, MMWORD PTR[GLOBAL(_7500)] - - psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 - psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 - psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 - psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 - - packssdw mm1, mm4 ; op[1] - packssdw mm3, mm5 ; op[3] - - ; done with vertical - ; transpose for the second stage - movq mm4, mm0 ; 00 10 20 30 - movq mm5, mm2 ; 02 12 22 32 - - punpcklwd mm0, mm1 ; 00 01 10 11 - punpckhwd mm4, mm1 ; 20 21 30 31 - - punpcklwd mm2, mm3 ; 02 03 12 13 - punpckhwd mm5, mm3 ; 22 23 32 33 - - movq mm1, mm0 ; 00 01 10 11 - punpckldq mm0, mm2 ; 00 01 02 03 - - punpckhdq mm1, mm2 ; 01 22 12 13 - - movq mm2, mm4 ; 20 
31 30 31 - punpckldq mm2, mm5 ; 20 21 22 23 - - punpckhdq mm4, mm5 ; 30 31 32 33 - - ; mm0 0 - ; mm1 1 - ; mm2 2 - ; mm3 4 - - movq mm5, mm0 - movq mm3, mm1 - - paddw mm0, mm4 ; a1 = 0 + 3 - paddw mm1, mm2 ; b1 = 1 + 2 - - psubw mm3, mm2 ; c1 = 1 - 2 - psubw mm5, mm4 ; d1 = 0 - 3 - - pxor mm6, mm6 ; zero out for compare - - pcmpeqw mm6, mm5 ; d1 != 0 - - pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; clear upper, - ; and keep bit 0 of lower - - ; output 0 and 2 - movq mm2, mm0 ; a1 - - paddw mm0, mm1 ; a1 + b1 - psubw mm2, mm1 ; a1 - b1 - - paddw mm0, MMWORD PTR[GLOBAL(_7w)] - paddw mm2, MMWORD PTR[GLOBAL(_7w)] - - psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4 - psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4 - - movq MMWORD PTR[rdi + 0 ], mm0 - movq MMWORD PTR[rdi + 16], mm2 - - ; output 1 and 3 - ; interleave c1, d1 - movq mm1, mm5 ; d1 - punpcklwd mm1, mm3 ; c1 d1 - punpckhwd mm5, mm3 ; c1 d1 - - movq mm3, mm1 - movq mm4, mm5 - - pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - - pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - - paddd mm1, MMWORD PTR[GLOBAL(_12000)] - paddd mm4, MMWORD PTR[GLOBAL(_12000)] - paddd mm3, MMWORD PTR[GLOBAL(_51000)] - paddd mm5, MMWORD PTR[GLOBAL(_51000)] - - psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 - psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 - psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 - psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 - - packssdw mm1, mm4 ; op[4] - packssdw mm3, mm5 ; op[12] - - paddw mm1, mm6 ; op[4] += (d1!=0) - - movq MMWORD PTR[rdi + 8 ], mm1 - movq MMWORD PTR[rdi + 24], mm3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 8 -_5352_2217: - dw 5352 - dw 2217 - dw 5352 - dw 2217 -align 8 -_2217_neg5352: - dw 2217 - dw -5352 - dw 2217 - dw -5352 -align 8 -_cmp_mask: - times 4 dw 1 -align 8 -_7w: - times 4 dw 7 -align 8 -_14500: - times 2 dd 14500 -align 8 -_7500: - times 2 dd 7500 -align 8 -_12000: - times 2 dd 12000 -align 8 -_51000: - times 2 dd 51000 diff --git a/libs/libvpx/vp8/encoder/x86/dct_sse2.asm b/libs/libvpx/vp8/encoder/x86/dct_sse2.asm index d06bca5927..4d92f0341d 100644 --- a/libs/libvpx/vp8/encoder/x86/dct_sse2.asm +++ b/libs/libvpx/vp8/encoder/x86/dct_sse2.asm @@ -60,6 +60,8 @@ ret %endmacro +SECTION .text + ;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch) global sym(vp8_short_fdct4x4_sse2) PRIVATE sym(vp8_short_fdct4x4_sse2): diff --git a/libs/libvpx/vp8/encoder/x86/encodeopt.asm b/libs/libvpx/vp8/encoder/x86/encodeopt.asm index fe26b18e56..f6c6aeae74 100644 --- a/libs/libvpx/vp8/encoder/x86/encodeopt.asm +++ b/libs/libvpx/vp8/encoder/x86/encodeopt.asm @@ -11,9 +11,11 @@ %include "vpx_ports/x86_abi_support.asm" -;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr) -global sym(vp8_block_error_xmm) PRIVATE -sym(vp8_block_error_xmm): +SECTION .text + +;int vp8_block_error_sse2(short *coeff_ptr, short *dcoef_ptr) +global sym(vp8_block_error_sse2) PRIVATE +sym(vp8_block_error_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 2 @@ -59,152 +61,9 @@ sym(vp8_block_error_xmm): pop rbp ret -;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr) -global sym(vp8_block_error_mmx) PRIVATE -sym(vp8_block_error_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;coeff_ptr - pxor mm7, mm7 - - mov 
rdi, arg(1) ;dcoef_ptr - movq mm3, [rsi] - - movq mm4, [rdi] - movq mm5, [rsi+8] - - movq mm6, [rdi+8] - pxor mm1, mm1 ; from movd mm1, dc ; dc =0 - - movq mm2, mm7 - psubw mm5, mm6 - - por mm1, mm2 - pmaddwd mm5, mm5 - - pcmpeqw mm1, mm7 - psubw mm3, mm4 - - pand mm1, mm3 - pmaddwd mm1, mm1 - - paddd mm1, mm5 - movq mm3, [rsi+16] - - movq mm4, [rdi+16] - movq mm5, [rsi+24] - - movq mm6, [rdi+24] - psubw mm5, mm6 - - pmaddwd mm5, mm5 - psubw mm3, mm4 - - pmaddwd mm3, mm3 - paddd mm3, mm5 - - paddd mm1, mm3 - movq mm0, mm1 - - psrlq mm1, 32 - paddd mm0, mm1 - - movq rax, mm0 - - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - -;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc); -global sym(vp8_mbblock_error_mmx_impl) PRIVATE -sym(vp8_mbblock_error_mmx_impl): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;coeff_ptr - pxor mm7, mm7 - - mov rdi, arg(1) ;dcoef_ptr - pxor mm2, mm2 - - movd mm1, dword ptr arg(2) ;dc - por mm1, mm2 - - pcmpeqw mm1, mm7 - mov rcx, 16 - -.mberror_loop_mmx: - movq mm3, [rsi] - movq mm4, [rdi] - - movq mm5, [rsi+8] - movq mm6, [rdi+8] - - - psubw mm5, mm6 - pmaddwd mm5, mm5 - - psubw mm3, mm4 - pand mm3, mm1 - - pmaddwd mm3, mm3 - paddd mm2, mm5 - - paddd mm2, mm3 - movq mm3, [rsi+16] - - movq mm4, [rdi+16] - movq mm5, [rsi+24] - - movq mm6, [rdi+24] - psubw mm5, mm6 - - pmaddwd mm5, mm5 - psubw mm3, mm4 - - pmaddwd mm3, mm3 - paddd mm2, mm5 - - paddd mm2, mm3 - add rsi, 32 - - add rdi, 32 - sub rcx, 1 - - jnz .mberror_loop_mmx - - movq mm0, mm2 - psrlq mm2, 32 - - paddd mm0, mm2 - movq rax, mm0 - - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - -;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); -global sym(vp8_mbblock_error_xmm_impl) PRIVATE -sym(vp8_mbblock_error_xmm_impl): +;int vp8_mbblock_error_sse2_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +global sym(vp8_mbblock_error_sse2_impl) PRIVATE +sym(vp8_mbblock_error_sse2_impl): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 3 @@ -272,66 +131,9 @@ sym(vp8_mbblock_error_xmm_impl): ret -;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr); -global sym(vp8_mbuverror_mmx_impl) PRIVATE -sym(vp8_mbuverror_mmx_impl): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;s_ptr - mov rdi, arg(1) ;d_ptr - - mov rcx, 16 - pxor mm7, mm7 - -.mbuverror_loop_mmx: - - movq mm1, [rsi] - movq mm2, [rdi] - - psubw mm1, mm2 - pmaddwd mm1, mm1 - - - movq mm3, [rsi+8] - movq mm4, [rdi+8] - - psubw mm3, mm4 - pmaddwd mm3, mm3 - - - paddd mm7, mm1 - paddd mm7, mm3 - - - add rsi, 16 - add rdi, 16 - - dec rcx - jnz .mbuverror_loop_mmx - - movq mm0, mm7 - psrlq mm7, 32 - - paddd mm0, mm7 - movq rax, mm0 - - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - -;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr); -global sym(vp8_mbuverror_xmm_impl) PRIVATE -sym(vp8_mbuverror_xmm_impl): +;int vp8_mbuverror_sse2_impl(short *s_ptr, short *d_ptr); +global sym(vp8_mbuverror_sse2_impl) PRIVATE +sym(vp8_mbuverror_sse2_impl): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 2 diff --git a/libs/libvpx/vp8/encoder/x86/fwalsh_sse2.asm b/libs/libvpx/vp8/encoder/x86/fwalsh_sse2.asm index f4989279f4..b5d5de4a59 100644 --- a/libs/libvpx/vp8/encoder/x86/fwalsh_sse2.asm +++ b/libs/libvpx/vp8/encoder/x86/fwalsh_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void 
vp8_short_walsh4x4_sse2(short *input, short *output, int pitch) global sym(vp8_short_walsh4x4_sse2) PRIVATE sym(vp8_short_walsh4x4_sse2): diff --git a/libs/libvpx/vp8/encoder/x86/quantize_mmx.asm b/libs/libvpx/vp8/encoder/x86/quantize_mmx.asm deleted file mode 100644 index 2864ce16d9..0000000000 --- a/libs/libvpx/vp8/encoder/x86/quantize_mmx.asm +++ /dev/null @@ -1,286 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, -; short *qcoeff_ptr,short *dequant_ptr, -; short *scan_mask, short *round_ptr, -; short *quant_ptr, short *dqcoeff_ptr); -global sym(vp8_fast_quantize_b_impl_mmx) PRIVATE -sym(vp8_fast_quantize_b_impl_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;coeff_ptr - movq mm0, [rsi] - - mov rax, arg(1) ;zbin_ptr - movq mm1, [rax] - - movq mm3, mm0 - psraw mm0, 15 - - pxor mm3, mm0 - psubw mm3, mm0 ; abs - - movq mm2, mm3 - pcmpgtw mm1, mm2 - - pandn mm1, mm2 - movq mm3, mm1 - - mov rdx, arg(6) ;quant_ptr - movq mm1, [rdx] - - mov rcx, arg(5) ;round_ptr - movq mm2, [rcx] - - paddw mm3, mm2 - pmulhuw mm3, mm1 - - pxor mm3, mm0 - psubw mm3, mm0 ;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - movq mm0, mm3 - - movq [rdi], mm3 - - mov rax, arg(3) ;dequant_ptr - movq mm2, [rax] - - pmullw mm3, mm2 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax], mm3 - - ; next 8 - movq mm4, [rsi+8] - - mov rax, arg(1) ;zbin_ptr - movq mm5, [rax+8] - - movq mm7, mm4 - psraw mm4, 15 - - pxor mm7, mm4 - psubw mm7, mm4 ; abs - - movq mm6, mm7 - pcmpgtw mm5, mm6 - - pandn mm5, mm6 - movq mm7, mm5 - - movq mm5, [rdx+8] - movq mm6, [rcx+8] - - paddw mm7, mm6 - pmulhuw mm7, mm5 - - pxor mm7, mm4 - psubw mm7, mm4;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - - movq mm1, mm7 - movq [rdi+8], mm7 - - mov rax, arg(3) ;dequant_ptr - movq mm6, [rax+8] - - pmullw mm7, mm6 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax+8], mm7 - - - ; next 8 - movq mm4, [rsi+16] - - mov rax, arg(1) ;zbin_ptr - movq mm5, [rax+16] - - movq mm7, mm4 - psraw mm4, 15 - - pxor mm7, mm4 - psubw mm7, mm4 ; abs - - movq mm6, mm7 - pcmpgtw mm5, mm6 - - pandn mm5, mm6 - movq mm7, mm5 - - movq mm5, [rdx+16] - movq mm6, [rcx+16] - - paddw mm7, mm6 - pmulhuw mm7, mm5 - - pxor mm7, mm4 - psubw mm7, mm4;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - - movq mm1, mm7 - movq [rdi+16], mm7 - - mov rax, arg(3) ;dequant_ptr - movq mm6, [rax+16] - - pmullw mm7, mm6 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax+16], mm7 - - - ; next 8 - movq mm4, [rsi+24] - - mov rax, arg(1) ;zbin_ptr - movq mm5, [rax+24] - - movq mm7, mm4 - psraw mm4, 15 - - pxor mm7, mm4 - psubw mm7, mm4 ; abs - - movq mm6, mm7 - pcmpgtw mm5, mm6 - - pandn mm5, mm6 - movq mm7, mm5 - - movq mm5, [rdx+24] - movq mm6, [rcx+24] - - paddw mm7, mm6 - pmulhuw mm7, mm5 - - pxor mm7, mm4 - psubw mm7, mm4;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - - movq mm1, mm7 - movq [rdi+24], mm7 - - mov rax, arg(3) ;dequant_ptr - movq mm6, [rax+24] - - pmullw mm7, mm6 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax+24], mm7 - - - - mov rdi, arg(4) 
;scan_mask - mov rsi, arg(2) ;qcoeff_ptr - - pxor mm5, mm5 - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rsi+8] - - movq mm2, [rdi] - movq mm3, [rdi+8]; - - pcmpeqw mm0, mm7 - pcmpeqw mm1, mm7 - - pcmpeqw mm6, mm6 - pxor mm0, mm6 - - pxor mm1, mm6 - psrlw mm0, 15 - - psrlw mm1, 15 - pmaddwd mm0, mm2 - - pmaddwd mm1, mm3 - movq mm5, mm0 - - paddd mm5, mm1 - - movq mm0, [rsi+16] - movq mm1, [rsi+24] - - movq mm2, [rdi+16] - movq mm3, [rdi+24]; - - pcmpeqw mm0, mm7 - pcmpeqw mm1, mm7 - - pcmpeqw mm6, mm6 - pxor mm0, mm6 - - pxor mm1, mm6 - psrlw mm0, 15 - - psrlw mm1, 15 - pmaddwd mm0, mm2 - - pmaddwd mm1, mm3 - paddd mm5, mm0 - - paddd mm5, mm1 - movq mm0, mm5 - - psrlq mm5, 32 - paddd mm0, mm5 - - ; eob adjustment begins here - movq rcx, mm0 - and rcx, 0xffff - - xor rdx, rdx - sub rdx, rcx ; rdx=-rcx - - bsr rax, rcx - inc rax - - sar rdx, 31 - and rax, rdx - ; Substitute the sse assembly for the old mmx mixed assembly/C. The - ; following is kept as reference - ; movq rcx, mm0 - ; bsr rax, rcx - ; - ; mov eob, rax - ; mov eee, rcx - ; - ;if(eee==0) - ;{ - ; eob=-1; - ;} - ;else if(eee<0) - ;{ - ; eob=15; - ;} - ;d->eob = eob+1; - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/libs/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/libs/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm index bd92b398a0..d2b4711b87 100644 --- a/libs/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm +++ b/libs/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ; void vp8_temporal_filter_apply_sse2 | arg ; (unsigned char *frame1, | 0 ; unsigned int stride, | 1 @@ -203,5 +205,5 @@ align 16 _const_top_bit: times 8 dw 1<<15 align 16 -_const_16w +_const_16w: times 8 dw 16 diff --git a/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c b/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c deleted file mode 100644 index d00abd4d08..0000000000 --- a/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "vpx_config.h" -#include "vp8_rtcd.h" -#include "vpx_ports/x86.h" -#include "vp8/encoder/block.h" - -void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch); -void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch) { - vp8_short_fdct4x4_mmx(input, output, pitch); - vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch); -} - -int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, - short *qcoeff_ptr, short *dequant_ptr, - const short *scan_mask, short *round_ptr, - short *quant_ptr, short *dqcoeff_ptr); -void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) { - const short *scan_mask = vp8_default_zig_zag_mask; - short *coeff_ptr = b->coeff; - short *zbin_ptr = b->zbin; - short *round_ptr = b->round; - short *quant_ptr = b->quant_fast; - short *qcoeff_ptr = d->qcoeff; - short *dqcoeff_ptr = d->dqcoeff; - short *dequant_ptr = d->dequant; - - *d->eob = (char)vp8_fast_quantize_b_impl_mmx( - coeff_ptr, zbin_ptr, qcoeff_ptr, dequant_ptr, scan_mask, - - round_ptr, quant_ptr, dqcoeff_ptr); -} - -int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc); -int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc) { - short *coeff_ptr = mb->block[0].coeff; - short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; - return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc); -} - -int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr); -int vp8_mbuverror_mmx(MACROBLOCK *mb) { - short *s_ptr = &mb->coeff[256]; - short *d_ptr = &mb->e_mbd.dqcoeff[256]; - return vp8_mbuverror_mmx_impl(s_ptr, d_ptr); -} diff --git a/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c b/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c index 29eb1f1b45..d0752453ee 100644 --- a/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c +++ b/libs/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c @@ -13,16 +13,16 @@ #include "vpx_ports/x86.h" #include "vp8/encoder/block.h" -int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); -int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc) { +int vp8_mbblock_error_sse2_impl(short *coeff_ptr, short *dcoef_ptr, int dc); +int vp8_mbblock_error_sse2(MACROBLOCK *mb, int dc) { short *coeff_ptr = mb->block[0].coeff; short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; - return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc); + return vp8_mbblock_error_sse2_impl(coeff_ptr, dcoef_ptr, dc); } -int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr); -int vp8_mbuverror_xmm(MACROBLOCK *mb) { +int vp8_mbuverror_sse2_impl(short *s_ptr, short *d_ptr); +int vp8_mbuverror_sse2(MACROBLOCK *mb) { short *s_ptr = &mb->coeff[256]; short *d_ptr = &mb->e_mbd.dqcoeff[256]; - return vp8_mbuverror_xmm_impl(s_ptr, d_ptr); + return vp8_mbuverror_sse2_impl(s_ptr, d_ptr); } diff --git a/libs/libvpx/vp8/encoder/x86/quantize_ssse3.c b/libs/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c similarity index 99% rename from libs/libvpx/vp8/encoder/x86/quantize_ssse3.c rename to libs/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c index 322f0a151f..d547450154 100644 --- a/libs/libvpx/vp8/encoder/x86/quantize_ssse3.c +++ b/libs/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c @@ -10,6 +10,7 @@ #include /* SSSE3 */ +#include "./vp8_rtcd.h" #include "vp8/encoder/block.h" /* bitscan reverse (bsr) */ diff --git a/libs/libvpx/vp8/vp8_common.mk b/libs/libvpx/vp8/vp8_common.mk index 886b127d6a..246fe6a677 100644 --- a/libs/libvpx/vp8/vp8_common.mk +++ b/libs/libvpx/vp8/vp8_common.mk @@ -80,8 +80,6 @@ VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c VP8_COMMON_SRCS-$(HAVE_MMX) += 
common/x86/dequantize_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm -VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm -VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_loopfilter_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm @@ -118,6 +116,14 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h +# common (c) +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/sixtap_filter_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/loopfilter_filters_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idctllm_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/dequantize_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/copymem_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idct_blk_mmi.c + ifeq ($(CONFIG_POSTPROC),yes) VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c endif diff --git a/libs/libvpx/vp8/vp8_cx_iface.c b/libs/libvpx/vp8/vp8_cx_iface.c index fac237eec0..af6689fd97 100644 --- a/libs/libvpx/vp8/vp8_cx_iface.c +++ b/libs/libvpx/vp8/vp8_cx_iface.c @@ -40,6 +40,7 @@ struct vp8_extracfg { vp8e_tuning tuning; unsigned int cq_level; /* constrained quality level */ unsigned int rc_max_intra_bitrate_pct; + unsigned int gf_cbr_boost_pct; unsigned int screen_content_mode; }; @@ -65,6 +66,7 @@ static struct vp8_extracfg default_extracfg = { 0, /* tuning*/ 10, /* cq_level */ 0, /* rc_max_intra_bitrate_pct */ + 0, /* gf_cbr_boost_pct */ 0, /* screen_content_mode */ }; @@ -315,6 +317,7 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, oxcf->target_bandwidth = cfg.rc_target_bitrate; oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct; + oxcf->gf_cbr_boost_pct = vp8_cfg.gf_cbr_boost_pct; oxcf->best_allowed_q = cfg.rc_min_quantizer; oxcf->worst_allowed_q = cfg.rc_max_quantizer; @@ -558,6 +561,13 @@ static vpx_codec_err_t set_rc_max_intra_bitrate_pct(vpx_codec_alg_priv_t *ctx, return update_extracfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.gf_cbr_boost_pct = CAST(VP8E_SET_GF_CBR_BOOST_PCT, args); + return update_extracfg(ctx, &extra_cfg); +} + static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp8_extracfg extra_cfg = ctx->vp8_cfg; @@ -1159,6 +1169,7 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = { { VP8E_SET_CQ_LEVEL, set_cq_level }, { VP8E_SET_MAX_INTRA_BITRATE_PCT, set_rc_max_intra_bitrate_pct }, { VP8E_SET_SCREEN_CONTENT_MODE, set_screen_content_mode }, + { VP8E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct }, { -1, NULL }, }; @@ -1205,6 +1216,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { 50, /* rc_two_pass_vbrbias */ 0, /* rc_two_pass_vbrmin_section */ 400, /* rc_two_pass_vbrmax_section */ + 0, // rc_2pass_vbr_corpus_complexity (only has meaningfull for VP9) /* keyframing settings (kf) */ VPX_KF_AUTO, /* g_kfmode*/ diff --git a/libs/libvpx/vp8/vp8_dx_iface.c b/libs/libvpx/vp8/vp8_dx_iface.c index b1f8340d67..f20283c1e1 100644 --- a/libs/libvpx/vp8/vp8_dx_iface.c +++ b/libs/libvpx/vp8/vp8_dx_iface.c @@ -24,6 +24,7 @@ #include "decoder/onyxd_int.h" #include "vpx_dsp/vpx_dsp_common.h" 
#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/system_state.h" #if CONFIG_ERROR_CONCEALMENT #include "decoder/error_concealment.h" #endif @@ -79,7 +80,6 @@ static int vp8_init_ctx(vpx_codec_ctx_t *ctx) { static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, vpx_codec_priv_enc_mr_cfg_t *data) { vpx_codec_err_t res = VPX_CODEC_OK; - vpx_codec_alg_priv_t *priv = NULL; (void)data; vp8_rtcd(); @@ -91,7 +91,10 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, * information becomes known. */ if (!ctx->priv) { + vpx_codec_alg_priv_t *priv; + if (vp8_init_ctx(ctx)) return VPX_CODEC_MEM_ERROR; + priv = (vpx_codec_alg_priv_t *)ctx->priv; /* initialize number of fragments to zero */ @@ -101,22 +104,6 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, (priv->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS); /*post processing level initialized to do nothing */ - } else { - priv = (vpx_codec_alg_priv_t *)ctx->priv; - } - - priv->yv12_frame_buffers.use_frame_threads = - (ctx->priv->init_flags & VPX_CODEC_USE_FRAME_THREADING); - - /* for now, disable frame threading */ - priv->yv12_frame_buffers.use_frame_threads = 0; - - if (priv->yv12_frame_buffers.use_frame_threads && - ((ctx->priv->init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT) || - (ctx->priv->init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS))) { - /* row-based threading, error concealment, and input fragments will - * not be supported when using frame-based threading */ - res = VPX_CODEC_INVALID_PARAM; } return res; @@ -157,8 +144,7 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data, } si->is_kf = 0; - if (data_sz >= 10 && !(clear[0] & 0x01)) /* I-Frame */ - { + if (data_sz >= 10 && !(clear[0] & 0x01)) { /* I-Frame */ si->is_kf = 1; /* vet via sync code */ @@ -241,7 +227,8 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, } static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, - unsigned int data_sz, vpx_codec_err_t *res) { + unsigned int data_sz, + volatile vpx_codec_err_t *res) { *res = VPX_CODEC_OK; if (ctx->fragments.count == 0) { @@ -280,7 +267,7 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, void *user_priv, long deadline) { - vpx_codec_err_t res = VPX_CODEC_OK; + volatile vpx_codec_err_t res; unsigned int resolution_change = 0; unsigned int w, h; @@ -365,7 +352,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, * reallocation is attempted on resync. 
*/ ctx->si.w = 0; ctx->si.h = 0; - vp8_clear_system_state(); + vpx_clear_system_state(); /* same return value as used in vp8dx_receive_compressed_data */ return -1; } @@ -427,7 +414,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, #endif #if CONFIG_MULTITHREAD - if (pbi->b_multithreaded_rd) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows); } #else @@ -518,7 +505,7 @@ static vpx_codec_err_t vp8_set_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); - if (data && !ctx->yv12_frame_buffers.use_frame_threads) { + if (data) { vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; YV12_BUFFER_CONFIG sd; @@ -535,7 +522,7 @@ static vpx_codec_err_t vp8_get_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); - if (data && !ctx->yv12_frame_buffers.use_frame_threads) { + if (data) { vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; YV12_BUFFER_CONFIG sd; @@ -548,6 +535,14 @@ static vpx_codec_err_t vp8_get_reference(vpx_codec_alg_priv_t *ctx, } } +static vpx_codec_err_t vp8_get_quantizer(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; + *arg = vp8dx_get_quantizer(ctx->yv12_frame_buffers.pbi[0]); + return VPX_CODEC_OK; +} + static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx, va_list args) { #if CONFIG_POSTPROC @@ -572,7 +567,7 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, va_list args) { int *update_info = va_arg(args, int *); - if (update_info && !ctx->yv12_frame_buffers.use_frame_threads) { + if (update_info) { VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0]; *update_info = pbi->common.refresh_alt_ref_frame * (int)VP8_ALTR_FRAME + @@ -585,12 +580,11 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, } } -extern int vp8dx_references_buffer(VP8_COMMON *oci, int ref_frame); static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx, va_list args) { int *ref_info = va_arg(args, int *); - if (ref_info && !ctx->yv12_frame_buffers.use_frame_threads) { + if (ref_info) { VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0]; VP8_COMMON *oci = &pbi->common; *ref_info = @@ -640,6 +634,7 @@ vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] = { { VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates }, { VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted }, { VP8D_GET_LAST_REF_USED, vp8_get_last_ref_frame }, + { VPXD_GET_LAST_QUANTIZER, vp8_get_quantizer }, { VPXD_SET_DECRYPTOR, vp8_set_decryptor }, { -1, NULL }, }; diff --git a/libs/libvpx/vp8/vp8cx.mk b/libs/libvpx/vp8/vp8cx.mk index 4e3d31e3b7..0dac0169d5 100644 --- a/libs/libvpx/vp8/vp8cx.mk +++ b/libs/libvpx/vp8/vp8cx.mk @@ -30,6 +30,7 @@ VP8_CX_SRCS-yes += encoder/encodeintra.c VP8_CX_SRCS-yes += encoder/encodemb.c VP8_CX_SRCS-yes += encoder/encodemv.c VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c +VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.h VP8_CX_SRCS-yes += encoder/firstpass.c VP8_CX_SRCS-yes += encoder/block.h VP8_CX_SRCS-yes += encoder/boolhuff.h @@ -56,11 +57,14 @@ VP8_CX_SRCS-yes += encoder/modecosts.c VP8_CX_SRCS-yes += encoder/onyx_if.c VP8_CX_SRCS-yes += encoder/pickinter.c VP8_CX_SRCS-yes += encoder/picklpf.c +VP8_CX_SRCS-yes += encoder/picklpf.h VP8_CX_SRCS-yes += encoder/vp8_quantize.c VP8_CX_SRCS-yes += encoder/ratectrl.c VP8_CX_SRCS-yes += 
encoder/rdopt.c VP8_CX_SRCS-yes += encoder/segmentation.c VP8_CX_SRCS-yes += encoder/segmentation.h +VP8_CX_SRCS-yes += common/vp8_skin_detection.c +VP8_CX_SRCS-yes += common/vp8_skin_detection.h VP8_CX_SRCS-yes += encoder/tokenize.c VP8_CX_SRCS-yes += encoder/dct_value_cost.h VP8_CX_SRCS-yes += encoder/dct_value_tokens.h @@ -68,20 +72,20 @@ VP8_CX_SRCS-yes += encoder/treewriter.c VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c VP8_CX_SRCS-yes += encoder/temporal_filter.c +VP8_CX_SRCS-yes += encoder/temporal_filter.h VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c +VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.h endif -VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm -VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_quantize_sse2.c -VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c +VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp8_quantize_ssse3.c VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.c ifeq ($(CONFIG_TEMPORAL_DENOISING),yes) @@ -90,7 +94,6 @@ endif VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c -VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm ifeq ($(CONFIG_REALTIME_ONLY),yes) @@ -107,6 +110,9 @@ VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c +VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/vp8_quantize_mmi.c +VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/dct_mmi.c + ifeq ($(CONFIG_TEMPORAL_DENOISING),yes) VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/denoising_msa.c endif diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c index eca655e728..025254c3f3 100644 --- a/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c +++ b/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c @@ -14,14 +14,7 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/common/vp9_common.h" - -static int16_t sinpi_1_9 = 0x14a3; -static int16_t sinpi_2_9 = 0x26c9; -static int16_t sinpi_3_9 = 0x3441; -static int16_t sinpi_4_9 = 0x3b6c; -static int16_t cospi_8_64 = 0x3b21; -static int16_t cospi_16_64 = 0x2d41; -static int16_t cospi_24_64 = 0x187e; +#include "vpx_dsp/txfm_common.h" static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) { int32x4_t q8s32, q9s32; @@ -37,7 +30,6 @@ static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) { *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]); *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]); - return; } static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16, @@ -45,7 +37,6 @@ static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16, *d0s16 = vdup_n_s16(cospi_8_64); *d1s16 = vdup_n_s16(cospi_16_64); *d2s16 = vdup_n_s16(cospi_24_64); - return; } static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16, @@ -54,7 +45,6 @@ static INLINE void 
GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16, *d4s16 = vdup_n_s16(sinpi_2_9); *q3s16 = vdupq_n_s16(sinpi_3_9); *d5s16 = vdup_n_s16(sinpi_4_9); - return; } static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16, @@ -80,17 +70,16 @@ static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16, q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16); q10s32 = vmlal_s16(q10s32, d19s16, *d2s16); - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q10s32, 14); + d26s16 = vrshrn_n_s32(q13s32, 14); + d27s16 = vrshrn_n_s32(q14s32, 14); + d29s16 = vrshrn_n_s32(q15s32, 14); + d28s16 = vrshrn_n_s32(q10s32, 14); q13s16 = vcombine_s16(d26s16, d27s16); q14s16 = vcombine_s16(d28s16, d29s16); *q8s16 = vaddq_s16(q13s16, q14s16); *q9s16 = vsubq_s16(q13s16, q14s16); *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16)); // vswp - return; } static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16, @@ -129,18 +118,17 @@ static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16, q14s32 = vaddq_s32(q11s32, q12s32); q10s32 = vsubq_s32(q10s32, q12s32); - d16s16 = vqrshrn_n_s32(q13s32, 14); - d17s16 = vqrshrn_n_s32(q14s32, 14); - d18s16 = vqrshrn_n_s32(q15s32, 14); - d19s16 = vqrshrn_n_s32(q10s32, 14); + d16s16 = vrshrn_n_s32(q13s32, 14); + d17s16 = vrshrn_n_s32(q14s32, 14); + d18s16 = vrshrn_n_s32(q15s32, 14); + d19s16 = vrshrn_n_s32(q10s32, 14); *q8s16 = vcombine_s16(d16s16, d17s16); *q9s16 = vcombine_s16(d18s16, d19s16); - return; } -void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride, int tx_type) { +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { uint8x8_t d26u8, d27u8; int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16; uint32x2_t d26u32, d27u32; @@ -156,9 +144,8 @@ void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, switch (tx_type) { case 0: // idct_idct is not supported. 
Fall back to C - vp9_iht4x4_16_add_c(input, dest, dest_stride, tx_type); + vp9_iht4x4_16_add_c(input, dest, stride, tx_type); return; - break; case 1: // iadst_idct // generate constants GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); @@ -209,11 +196,11 @@ void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, q9s16 = vrshrq_n_s16(q9s16, 4); d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0); - dest += dest_stride; + dest += stride; d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1); - dest += dest_stride; + dest += stride; d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0); - dest += dest_stride; + dest += stride; d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1); q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); @@ -223,11 +210,10 @@ void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1); - dest -= dest_stride; + dest -= stride; vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0); - dest -= dest_stride; + dest -= stride; vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1); - dest -= dest_stride; + dest -= stride; vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0); - return; } diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c index 0e3febf159..1c739861c3 100644 --- a/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c +++ b/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c @@ -76,10 +76,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); + d8s16 = vrshrn_n_s32(q2s32, 14); + d9s16 = vrshrn_n_s32(q3s32, 14); + d10s16 = vrshrn_n_s32(q5s32, 14); + d11s16 = vrshrn_n_s32(q6s32, 14); q4s16 = vcombine_s16(d8s16, d9s16); q5s16 = vcombine_s16(d10s16, d11s16); @@ -93,10 +93,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q9s32 = vmlal_s16(q9s32, d22s16, d2s16); q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - d14s16 = vqrshrn_n_s32(q2s32, 14); - d15s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q13s32, 14); + d14s16 = vrshrn_n_s32(q2s32, 14); + d15s16 = vrshrn_n_s32(q3s32, 14); + d12s16 = vrshrn_n_s32(q9s32, 14); + d13s16 = vrshrn_n_s32(q13s32, 14); q6s16 = vcombine_s16(d12s16, d13s16); q7s16 = vcombine_s16(d14s16, d15s16); @@ -115,10 +115,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, d0s16 = vdup_n_s16(cospi_24_64); d1s16 = vdup_n_s16(cospi_8_64); - d18s16 = vqrshrn_n_s32(q2s32, 14); - d19s16 = vqrshrn_n_s32(q3s32, 14); - d22s16 = vqrshrn_n_s32(q13s32, 14); - d23s16 = vqrshrn_n_s32(q15s32, 14); + d18s16 = vrshrn_n_s32(q2s32, 14); + d19s16 = vrshrn_n_s32(q3s32, 14); + d22s16 = vrshrn_n_s32(q13s32, 14); + d23s16 = vrshrn_n_s32(q15s32, 14); *q9s16 = vcombine_s16(d18s16, d19s16); *q11s16 = vcombine_s16(d22s16, d23s16); @@ -132,10 +132,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q8s32 = vmlal_s16(q8s32, d28s16, d0s16); q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - d26s16 = vqrshrn_n_s32(q2s32, 14); - d27s16 = vqrshrn_n_s32(q3s32, 14); - d30s16 = vqrshrn_n_s32(q8s32, 14); - d31s16 = vqrshrn_n_s32(q12s32, 14); + d26s16 = 
vrshrn_n_s32(q2s32, 14); + d27s16 = vrshrn_n_s32(q3s32, 14); + d30s16 = vrshrn_n_s32(q8s32, 14); + d31s16 = vrshrn_n_s32(q12s32, 14); *q13s16 = vcombine_s16(d26s16, d27s16); *q15s16 = vcombine_s16(d30s16, d31s16); @@ -165,10 +165,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q11s32 = vmlal_s16(q11s32, d26s16, d16s16); q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); + d10s16 = vrshrn_n_s32(q9s32, 14); + d11s16 = vrshrn_n_s32(q10s32, 14); + d12s16 = vrshrn_n_s32(q11s32, 14); + d13s16 = vrshrn_n_s32(q12s32, 14); q5s16 = vcombine_s16(d10s16, d11s16); q6s16 = vcombine_s16(d12s16, d13s16); @@ -180,7 +180,6 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, *q13s16 = vsubq_s16(q2s16, q5s16); *q14s16 = vsubq_s16(q1s16, q6s16); *q15s16 = vsubq_s16(q0s16, q7s16); - return; } static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, @@ -243,8 +242,8 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q1s32 = vsubq_s32(q1s32, q5s32); q2s32 = vsubq_s32(q2s32, q6s32); - d22s16 = vqrshrn_n_s32(q11s32, 14); - d23s16 = vqrshrn_n_s32(q12s32, 14); + d22s16 = vrshrn_n_s32(q11s32, 14); + d23s16 = vrshrn_n_s32(q12s32, 14); *q11s16 = vcombine_s16(d22s16, d23s16); q12s32 = vaddq_s32(q3s32, q7s32); @@ -252,12 +251,12 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q3s32 = vsubq_s32(q3s32, q7s32); q4s32 = vsubq_s32(q4s32, q8s32); - d2s16 = vqrshrn_n_s32(q1s32, 14); - d3s16 = vqrshrn_n_s32(q2s32, 14); - d24s16 = vqrshrn_n_s32(q12s32, 14); - d25s16 = vqrshrn_n_s32(q15s32, 14); - d6s16 = vqrshrn_n_s32(q3s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); + d2s16 = vrshrn_n_s32(q1s32, 14); + d3s16 = vrshrn_n_s32(q2s32, 14); + d24s16 = vrshrn_n_s32(q12s32, 14); + d25s16 = vrshrn_n_s32(q15s32, 14); + d6s16 = vrshrn_n_s32(q3s32, 14); + d7s16 = vrshrn_n_s32(q4s32, 14); *q12s16 = vcombine_s16(d24s16, d25s16); d0s16 = vdup_n_s16(cospi_10_64); @@ -292,10 +291,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q2s32 = vsubq_s32(q2s32, q10s32); q6s32 = vsubq_s32(q6s32, q9s32); - d28s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d4s16 = vqrshrn_n_s32(q2s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); + d28s16 = vrshrn_n_s32(q14s32, 14); + d29s16 = vrshrn_n_s32(q15s32, 14); + d4s16 = vrshrn_n_s32(q2s32, 14); + d5s16 = vrshrn_n_s32(q6s32, 14); *q14s16 = vcombine_s16(d28s16, d29s16); q9s32 = vaddq_s32(q4s32, q0s32); @@ -306,10 +305,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, d30s16 = vdup_n_s16(cospi_8_64); d31s16 = vdup_n_s16(cospi_24_64); - d18s16 = vqrshrn_n_s32(q9s32, 14); - d19s16 = vqrshrn_n_s32(q10s32, 14); - d8s16 = vqrshrn_n_s32(q4s32, 14); - d9s16 = vqrshrn_n_s32(q5s32, 14); + d18s16 = vrshrn_n_s32(q9s32, 14); + d19s16 = vrshrn_n_s32(q10s32, 14); + d8s16 = vrshrn_n_s32(q4s32, 14); + d9s16 = vrshrn_n_s32(q5s32, 14); *q9s16 = vcombine_s16(d18s16, d19s16); q5s32 = vmull_s16(d2s16, d30s16); @@ -342,10 +341,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q5s32 = vsubq_s32(q5s32, q1s32); q6s32 = vsubq_s32(q6s32, q3s32); - d18s16 = vqrshrn_n_s32(q14s32, 14); - d19s16 = vqrshrn_n_s32(q15s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); + d18s16 = vrshrn_n_s32(q14s32, 14); + d19s16 = vrshrn_n_s32(q15s32, 14); + d10s16 = vrshrn_n_s32(q5s32, 14); + d11s16 = 
vrshrn_n_s32(q6s32, 14); *q9s16 = vcombine_s16(d18s16, d19s16); q1s32 = vaddq_s32(q7s32, q10s32); @@ -353,10 +352,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q7s32 = vsubq_s32(q7s32, q10s32); q0s32 = vsubq_s32(q0s32, q2s32); - d28s16 = vqrshrn_n_s32(q1s32, 14); - d29s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q7s32, 14); - d15s16 = vqrshrn_n_s32(q0s32, 14); + d28s16 = vrshrn_n_s32(q1s32, 14); + d29s16 = vrshrn_n_s32(q3s32, 14); + d14s16 = vrshrn_n_s32(q7s32, 14); + d15s16 = vrshrn_n_s32(q0s32, 14); *q14s16 = vcombine_s16(d28s16, d29s16); d30s16 = vdup_n_s16(cospi_16_64); @@ -375,10 +374,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q13s32 = vmlsl_s16(q13s32, d24s16, d30s16); q1s32 = vmlsl_s16(q1s32, d25s16, d30s16); - d4s16 = vqrshrn_n_s32(q2s32, 14); - d5s16 = vqrshrn_n_s32(q3s32, 14); - d24s16 = vqrshrn_n_s32(q13s32, 14); - d25s16 = vqrshrn_n_s32(q1s32, 14); + d4s16 = vrshrn_n_s32(q2s32, 14); + d5s16 = vrshrn_n_s32(q3s32, 14); + d24s16 = vrshrn_n_s32(q13s32, 14); + d25s16 = vrshrn_n_s32(q1s32, 14); q2s16 = vcombine_s16(d4s16, d5s16); *q12s16 = vcombine_s16(d24s16, d25s16); @@ -392,10 +391,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q11s32 = vmlsl_s16(q11s32, d14s16, d30s16); q0s32 = vmlsl_s16(q0s32, d15s16, d30s16); - d20s16 = vqrshrn_n_s32(q13s32, 14); - d21s16 = vqrshrn_n_s32(q1s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q0s32, 14); + d20s16 = vrshrn_n_s32(q13s32, 14); + d21s16 = vrshrn_n_s32(q1s32, 14); + d12s16 = vrshrn_n_s32(q11s32, 14); + d13s16 = vrshrn_n_s32(q0s32, 14); *q10s16 = vcombine_s16(d20s16, d21s16); q6s16 = vcombine_s16(d12s16, d13s16); @@ -405,11 +404,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, *q11s16 = vsubq_s16(q5s16, q2s16); *q13s16 = vsubq_s16(q5s16, q6s16); *q15s16 = vsubq_s16(q5s16, q4s16); - return; } -void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride, int tx_type) { +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { int i; uint8_t *d1, *d2; uint8x8_t d0u8, d1u8, d2u8, d3u8; @@ -431,9 +429,8 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, switch (tx_type) { case 0: // idct_idct is not supported. 
Fall back to C - vp9_iht8x8_64_add_c(input, dest, dest_stride, tx_type); + vp9_iht8x8_64_add_c(input, dest, stride, tx_type); return; - break; case 1: // iadst_idct // generate IDCT constants // GENERATE_IDCT_CONSTANTS @@ -511,13 +508,13 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, } d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; + d1 += stride; d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; + d1 += stride; d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; + d1 += stride; d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; + d1 += stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); @@ -532,13 +529,12 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; + d2 += stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; + d2 += stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; + d2 += stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; + d2 += stride; } - return; } diff --git a/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c index 2d4839174d..f6b29265e6 100644 --- a/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c +++ b/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c @@ -21,8 +21,8 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride, int tx_type) { +void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { int i, j; DECLARE_ALIGNED(32, int16_t, out[4 * 4]); int16_t *outptr = out; @@ -37,7 +37,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical vpx_idct4_rows_dspr2(input, outptr); - vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); + vpx_idct4_columns_add_blk_dspr2(&out[0], dest, stride); break; case ADST_DCT: // ADST in vertical, DCT in horizontal vpx_idct4_rows_dspr2(input, outptr); @@ -48,8 +48,8 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, iadst4_dspr2(outptr, temp_out); for (j = 0; j < 4; ++j) - dest[j * dest_stride + i] = clip_pixel( - ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * stride + i]); outptr += 4; } @@ -66,7 +66,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, temp_in[i * 4 + j] = out[j * 4 + i]; } } - vpx_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + vpx_idct4_columns_add_blk_dspr2(&temp_in[0], dest, stride); break; case ADST_ADST: // ADST in both directions for (i = 0; i < 4; ++i) { @@ -80,8 +80,8 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, iadst4_dspr2(temp_in, temp_out); for (j = 0; j < 4; ++j) - dest[j * dest_stride + i] = clip_pixel( - ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * stride + i]); } break; default: printf("vp9_short_iht4x4_add_dspr2 : Invalid tx_type\n"); break; diff --git a/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c index 
86896f04ca..b945e307e6 100644 --- a/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c +++ b/libs/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c @@ -20,8 +20,8 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride, int tx_type) { +void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { int i, j; DECLARE_ALIGNED(32, int16_t, out[8 * 8]); int16_t *outptr = out; @@ -34,7 +34,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical idct8_rows_dspr2(input, outptr, 8); - idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, stride); break; case ADST_DCT: // ADST in vertical, DCT in horizontal idct8_rows_dspr2(input, outptr, 8); @@ -43,8 +43,8 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, iadst8_dspr2(&out[i * 8], temp_out); for (j = 0; j < 8; ++j) - dest[j * dest_stride + i] = clip_pixel( - ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * stride + i]); } break; case DCT_ADST: // DCT in vertical, ADST in horizontal @@ -59,7 +59,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, temp_in[i * 8 + j] = out[j * 8 + i]; } } - idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&temp_in[0], dest, stride); break; case ADST_ADST: // ADST in both directions for (i = 0; i < 8; ++i) { @@ -74,8 +74,8 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, iadst8_dspr2(temp_in, temp_out); for (j = 0; j < 8; ++j) - dest[j * dest_stride + i] = clip_pixel( - ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * stride + i]); } break; default: printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n"); break; diff --git a/libs/libvpx/vp9/common/vp9_alloccommon.c b/libs/libvpx/vp9/common/vp9_alloccommon.c index 66aa733b9e..7345e259b6 100644 --- a/libs/libvpx/vp9/common/vp9_alloccommon.c +++ b/libs/libvpx/vp9/common/vp9_alloccommon.c @@ -17,24 +17,6 @@ #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_onyxc_int.h" -// TODO(hkuang): Don't need to lock the whole pool after implementing atomic -// frame reference count. 
-void lock_buffer_pool(BufferPool *const pool) { -#if CONFIG_MULTITHREAD - pthread_mutex_lock(&pool->pool_mutex); -#else - (void)pool; -#endif -} - -void unlock_buffer_pool(BufferPool *const pool) { -#if CONFIG_MULTITHREAD - pthread_mutex_unlock(&pool->pool_mutex); -#else - (void)pool; -#endif -} - void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) { const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); @@ -62,8 +44,7 @@ static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) { cm->prev_seg_map_idx = 1; cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx]; - if (!cm->frame_parallel_decode) - cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; + cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; return 0; } @@ -77,20 +58,18 @@ static void free_seg_map(VP9_COMMON *cm) { } cm->current_frame_seg_map = NULL; - - if (!cm->frame_parallel_decode) { - cm->last_frame_seg_map = NULL; - } + cm->last_frame_seg_map = NULL; } void vp9_free_ref_frame_buffers(BufferPool *pool) { int i; for (i = 0; i < FRAME_BUFFERS; ++i) { - if (pool->frame_bufs[i].ref_count > 0 && + if (!pool->frame_bufs[i].released && pool->frame_bufs[i].raw_frame_buffer.data != NULL) { pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer); pool->frame_bufs[i].ref_count = 0; + pool->frame_bufs[i].released = 1; } vpx_free(pool->frame_bufs[i].mvs); pool->frame_bufs[i].mvs = NULL; @@ -176,6 +155,9 @@ fail: } void vp9_remove_common(VP9_COMMON *cm) { +#if CONFIG_VP9_POSTPROC + vp9_free_postproc_buffers(cm); +#endif vp9_free_context_buffers(cm); vpx_free(cm->fc); @@ -186,7 +168,7 @@ void vp9_remove_common(VP9_COMMON *cm) { void vp9_init_context_buffers(VP9_COMMON *cm) { cm->setup_mi(cm); - if (cm->last_frame_seg_map && !cm->frame_parallel_decode) + if (cm->last_frame_seg_map) memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols); } diff --git a/libs/libvpx/vp9/common/vp9_blockd.c b/libs/libvpx/vp9/common/vp9_blockd.c index b0249687fd..4327599510 100644 --- a/libs/libvpx/vp9/common/vp9_blockd.c +++ b/libs/libvpx/vp9/common/vp9_blockd.c @@ -53,8 +53,9 @@ void vp9_foreach_transformed_block_in_plane( // the current block size extends into the UMV and we won't // visit the sub blocks that are wholly within the UMV. const int max_blocks_wide = - num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> - (5 + pd->subsampling_x)); + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 
0 diff --git a/libs/libvpx/vp9/common/vp9_debugmodes.c b/libs/libvpx/vp9/common/vp9_debugmodes.c index 7d128c9f7f..28cd4a1924 100644 --- a/libs/libvpx/vp9/common/vp9_debugmodes.c +++ b/libs/libvpx/vp9/common/vp9_debugmodes.c @@ -34,7 +34,7 @@ static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor, for (mi_row = 0; mi_row < rows; mi_row++) { fprintf(file, "%c ", prefix); for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(file, "%2d ", *((int *)((char *)(mi[0]) + member_offset))); + fprintf(file, "%2d ", *((char *)((char *)(mi[0]) + member_offset))); mi++; } fprintf(file, "\n"); diff --git a/libs/libvpx/vp9/common/vp9_entropymode.c b/libs/libvpx/vp9/common/vp9_entropymode.c index 22365efc50..47cd63e94f 100644 --- a/libs/libvpx/vp9/common/vp9_entropymode.c +++ b/libs/libvpx/vp9/common/vp9_entropymode.c @@ -179,29 +179,29 @@ static const vpx_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = { { 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm }; -const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] - [PARTITION_TYPES - 1] = { - // 8x8 -> 4x4 - { 158, 97, 94 }, // a/l both not split - { 93, 24, 99 }, // a split, l not split - { 85, 119, 44 }, // l split, a not split - { 62, 59, 67 }, // a/l both split - // 16x16 -> 8x8 - { 149, 53, 53 }, // a/l both not split - { 94, 20, 48 }, // a split, l not split - { 83, 53, 24 }, // l split, a not split - { 52, 18, 18 }, // a/l both split - // 32x32 -> 16x16 - { 150, 40, 39 }, // a/l both not split - { 78, 12, 26 }, // a split, l not split - { 67, 33, 11 }, // l split, a not split - { 24, 7, 5 }, // a/l both split - // 64x64 -> 32x32 - { 174, 35, 49 }, // a/l both not split - { 68, 11, 27 }, // a split, l not split - { 57, 15, 9 }, // l split, a not split - { 12, 3, 3 }, // a/l both split - }; +const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] = + { + // 8x8 -> 4x4 + { 158, 97, 94 }, // a/l both not split + { 93, 24, 99 }, // a split, l not split + { 85, 119, 44 }, // l split, a not split + { 62, 59, 67 }, // a/l both split + // 16x16 -> 8x8 + { 149, 53, 53 }, // a/l both not split + { 94, 20, 48 }, // a split, l not split + { 83, 53, 24 }, // l split, a not split + { 52, 18, 18 }, // a/l both split + // 32x32 -> 16x16 + { 150, 40, 39 }, // a/l both not split + { 78, 12, 26 }, // a split, l not split + { 67, 33, 11 }, // l split, a not split + { 24, 7, 5 }, // a/l both split + // 64x64 -> 32x32 + { 174, 35, 49 }, // a/l both not split + { 68, 11, 27 }, // a split, l not split + { 57, 15, 9 }, // l split, a not split + { 12, 3, 3 }, // a/l both split + }; static const vpx_prob default_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] = { @@ -428,7 +428,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { vp9_clearall_segfeatures(&cm->seg); cm->seg.abs_delta = SEGMENT_DELTADATA; - if (cm->last_frame_seg_map && !cm->frame_parallel_decode) + if (cm->last_frame_seg_map) memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); if (cm->current_frame_seg_map) @@ -457,7 +457,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { } // prev_mip will only be allocated in encoder. 
- if (frame_is_intra_only(cm) && cm->prev_mip && !cm->frame_parallel_decode) + if (frame_is_intra_only(cm) && cm->prev_mip) memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip)); diff --git a/libs/libvpx/vp9/common/vp9_entropymv.h b/libs/libvpx/vp9/common/vp9_entropymv.h index a8cc7e93a2..e2fe37a327 100644 --- a/libs/libvpx/vp9/common/vp9_entropymv.h +++ b/libs/libvpx/vp9/common/vp9_entropymv.h @@ -27,12 +27,9 @@ void vp9_init_mv_probs(struct VP9Common *cm); void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp); -// Integer pel reference mv threshold for use of high-precision 1/8 mv -#define COMPANDED_MVREF_THRESH 8 - static INLINE int use_mv_hp(const MV *ref) { - return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH && - (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH; + const int kMvRefThresh = 64; // threshold for use of high-precision 1/8 mv + return abs(ref->row) < kMvRefThresh && abs(ref->col) < kMvRefThresh; } #define MV_UPDATE_PROB 252 diff --git a/libs/libvpx/vp9/common/vp9_frame_buffers.c b/libs/libvpx/vp9/common/vp9_frame_buffers.c index efcf2bf885..a254e79d20 100644 --- a/libs/libvpx/vp9/common/vp9_frame_buffers.c +++ b/libs/libvpx/vp9/common/vp9_frame_buffers.c @@ -52,14 +52,12 @@ int vp9_get_frame_buffer(void *cb_priv, size_t min_size, if (i == int_fb_list->num_internal_frame_buffers) return -1; if (int_fb_list->int_fb[i].size < min_size) { - int_fb_list->int_fb[i].data = - (uint8_t *)vpx_realloc(int_fb_list->int_fb[i].data, min_size); - if (!int_fb_list->int_fb[i].data) return -1; - - // This memset is needed for fixing valgrind error from C loop filter + vpx_free(int_fb_list->int_fb[i].data); + // The data must be zeroed to fix a valgrind error from the C loop filter // due to access uninitialized memory in frame border. It could be - // removed if border is totally removed. - memset(int_fb_list->int_fb[i].data, 0, min_size); + // skipped if border were totally removed. 
+ int_fb_list->int_fb[i].data = (uint8_t *)vpx_calloc(1, min_size); + if (!int_fb_list->int_fb[i].data) return -1; int_fb_list->int_fb[i].size = min_size; } diff --git a/libs/libvpx/vp9/common/vp9_idct.c b/libs/libvpx/vp9/common/vp9_idct.c index dc2e03946d..69069042cc 100644 --- a/libs/libvpx/vp9/common/vp9_idct.c +++ b/libs/libvpx/vp9/common/vp9_idct.c @@ -156,6 +156,8 @@ void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, vpx_idct16x16_1_add(input, dest, stride); else if (eob <= 10) vpx_idct16x16_10_add(input, dest, stride); + else if (eob <= 38) + vpx_idct16x16_38_add(input, dest, stride); else vpx_idct16x16_256_add(input, dest, stride); } @@ -203,18 +205,7 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, #if CONFIG_VP9_HIGHBITDEPTH -// 12 signal input bits + 7 forward transform amplify bits + 1 bit -// for contingency in rounding and quantizing -#define VALID_IHT_MAGNITUDE_RANGE (1 << 20) - -static INLINE int detect_invalid_iht_input(const tran_low_t *input, int size) { - int i; - for (i = 0; i < size; ++i) - if (abs(input[i]) >= VALID_IHT_MAGNITUDE_RANGE) return 1; - return 0; -} - -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { const highbd_transform_2d IHT_4[] = { { vpx_highbd_idct4_c, vpx_highbd_idct4_c }, // DCT_DCT = 0 @@ -222,20 +213,12 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, { vpx_highbd_idct4_c, vpx_highbd_iadst4_c }, // DCT_ADST = 2 { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c } // ADST_ADST = 3 }; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); int i, j; tran_low_t out[4 * 4]; tran_low_t *outptr = out; tran_low_t temp_in[4], temp_out[4]; - if (detect_invalid_iht_input(input, 16)) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - assert(0 && "invalid highbd iht input"); -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING - return; - } - // Inverse transform row vectors. for (i = 0; i < 4; ++i) { IHT_4[tx_type].rows(input, outptr, bd); @@ -261,21 +244,13 @@ static const highbd_transform_2d HIGH_IHT_8[] = { { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c } // ADST_ADST = 3 }; -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { int i, j; tran_low_t out[8 * 8]; tran_low_t *outptr = out; tran_low_t temp_in[8], temp_out[8]; const highbd_transform_2d ht = HIGH_IHT_8[tx_type]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - if (detect_invalid_iht_input(input, 64)) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - assert(0 && "invalid highbd iht input"); -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING - return; - } // Inverse transform row vectors. 
for (i = 0; i < 8; ++i) { @@ -302,21 +277,13 @@ static const highbd_transform_2d HIGH_IHT_16[] = { { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c } // ADST_ADST = 3 }; -void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { int i, j; tran_low_t out[16 * 16]; tran_low_t *outptr = out; tran_low_t temp_in[16], temp_out[16]; const highbd_transform_2d ht = HIGH_IHT_16[tx_type]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - if (detect_invalid_iht_input(input, 256)) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - assert(0 && "invalid highbd iht input"); -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING - return; - } // Rows for (i = 0; i < 16; ++i) { @@ -337,7 +304,7 @@ void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, } // idct -void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { if (eob > 1) vpx_highbd_idct4x4_16_add(input, dest, stride, bd); @@ -345,7 +312,7 @@ void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, vpx_highbd_idct4x4_1_add(input, dest, stride, bd); } -void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { if (eob > 1) vpx_highbd_iwht4x4_16_add(input, dest, stride, bd); @@ -353,7 +320,7 @@ void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, vpx_highbd_iwht4x4_1_add(input, dest, stride, bd); } -void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { // If dc is 1, then input[0] is the reconstructed value, do not need // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. @@ -363,14 +330,14 @@ void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, // DC only DCT coefficient if (eob == 1) { vpx_highbd_idct8x8_1_add(input, dest, stride, bd); - } else if (eob <= 10) { - vpx_highbd_idct8x8_10_add(input, dest, stride, bd); + } else if (eob <= 12) { + vpx_highbd_idct8x8_12_add(input, dest, stride, bd); } else { vpx_highbd_idct8x8_64_add(input, dest, stride, bd); } } -void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { // The calculation can be simplified if there are not many non-zero dct // coefficients. Use eobs to separate different cases. 
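Reviewer note on the hunks just above and below (vp9_idct16x16_add / vp9_highbd_idct16x16_add): the eob ("end of block") value counts coefficients in scan order, so a small eob guarantees that every later coefficient is zero and a cheaper partial inverse transform is sufficient; this patch adds a 38-coefficient path between the existing 10-coefficient and full 256-coefficient paths. The stand-alone C sketch below only restates the dispatch thresholds used in this diff; the enum and function names are illustrative and are not libvpx identifiers.

/* Illustrative sketch of the eob-threshold dispatch for a 16x16 block.
 * The thresholds mirror vp9_idct16x16_add() in this diff. */
typedef enum {
  IDCT16_DC_ONLY,    /* eob == 1:  vpx_idct16x16_1_add (DC coefficient only) */
  IDCT16_PARTIAL_10, /* eob <= 10: vpx_idct16x16_10_add */
  IDCT16_PARTIAL_38, /* eob <= 38: vpx_idct16x16_38_add (added by this patch) */
  IDCT16_FULL        /* otherwise: vpx_idct16x16_256_add */
} idct16_path;

static idct16_path choose_idct16x16_path(int eob) {
  if (eob == 1) return IDCT16_DC_ONLY;
  if (eob <= 10) return IDCT16_PARTIAL_10;
  if (eob <= 38) return IDCT16_PARTIAL_38;
  return IDCT16_FULL;
}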
@@ -379,18 +346,22 @@ void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, vpx_highbd_idct16x16_1_add(input, dest, stride, bd); } else if (eob <= 10) { vpx_highbd_idct16x16_10_add(input, dest, stride, bd); + } else if (eob <= 38) { + vpx_highbd_idct16x16_38_add(input, dest, stride, bd); } else { vpx_highbd_idct16x16_256_add(input, dest, stride, bd); } } -void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { // Non-zero coeff only in upper-left 8x8 if (eob == 1) { vpx_highbd_idct32x32_1_add(input, dest, stride, bd); } else if (eob <= 34) { vpx_highbd_idct32x32_34_add(input, dest, stride, bd); + } else if (eob <= 135) { + vpx_highbd_idct32x32_135_add(input, dest, stride, bd); } else { vpx_highbd_idct32x32_1024_add(input, dest, stride, bd); } @@ -398,7 +369,7 @@ void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, // iht void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd) { + uint16_t *dest, int stride, int eob, int bd) { if (tx_type == DCT_DCT) vp9_highbd_idct4x4_add(input, dest, stride, eob, bd); else @@ -406,7 +377,7 @@ void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, } void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd) { + uint16_t *dest, int stride, int eob, int bd) { if (tx_type == DCT_DCT) { vp9_highbd_idct8x8_add(input, dest, stride, eob, bd); } else { @@ -415,7 +386,7 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, } void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd) { + uint16_t *dest, int stride, int eob, int bd) { if (tx_type == DCT_DCT) { vp9_highbd_idct16x16_add(input, dest, stride, eob, bd); } else { diff --git a/libs/libvpx/vp9/common/vp9_idct.h b/libs/libvpx/vp9/common/vp9_idct.h index ea958a38c0..3e83b8402d 100644 --- a/libs/libvpx/vp9/common/vp9_idct.h +++ b/libs/libvpx/vp9/common/vp9_idct.h @@ -57,22 +57,22 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, int stride, int eob); #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd); + uint16_t *dest, int stride, int eob, int bd); void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd); + uint16_t *dest, int stride, int eob, int bd); void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const 
tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd); + uint16_t *dest, int stride, int eob, int bd); #endif // CONFIG_VP9_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" diff --git a/libs/libvpx/vp9/common/vp9_loopfilter.c b/libs/libvpx/vp9/common/vp9_loopfilter.c index d796d7161c..c7c343aed5 100644 --- a/libs/libvpx/vp9/common/vp9_loopfilter.c +++ b/libs/libvpx/vp9/common/vp9_loopfilter.c @@ -465,12 +465,11 @@ static void filter_selectively_horiz( if (mask_16x16 & 1) { if ((mask_16x16 & 3) == 3) { - vpx_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim, + vpx_lpf_horizontal_16_dual(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); count = 2; } else { - vpx_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr); + vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); } } else if (mask_8x8 & 1) { if ((mask_8x8 & 3) == 3) { @@ -559,12 +558,12 @@ static void highbd_filter_selectively_horiz( if (mask_16x16 & 1) { if ((mask_16x16 & 3) == 3) { - vpx_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim, + vpx_highbd_lpf_horizontal_16_dual(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, bd); count = 2; } else { - vpx_highbd_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, bd); + vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); } } else if (mask_8x8 & 1) { if ((mask_8x8 & 3) == 3) { @@ -1613,12 +1612,14 @@ void vp9_loop_filter_data_reset( void vp9_reset_lfm(VP9_COMMON *const cm) { if (cm->lf.filter_level) { - memset(cm->lf.lfm, 0, ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * - cm->lf.lfm_stride * sizeof(*cm->lf.lfm)); + memset(cm->lf.lfm, 0, + ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride * + sizeof(*cm->lf.lfm)); } } -int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) { +int vp9_loop_filter_worker(void *arg1, void *unused) { + LFWorkerData *const lf_data = (LFWorkerData *)arg1; (void)unused; loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->start, lf_data->stop, lf_data->y_only); diff --git a/libs/libvpx/vp9/common/vp9_loopfilter.h b/libs/libvpx/vp9/common/vp9_loopfilter.h index da37a6ebde..481a6cdc63 100644 --- a/libs/libvpx/vp9/common/vp9_loopfilter.h +++ b/libs/libvpx/vp9/common/vp9_loopfilter.h @@ -151,8 +151,8 @@ void vp9_loop_filter_data_reset( LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer, struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]); -// Operates on the rows described by 'lf_data'. -int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused); +// Operates on the rows described by 'arg1' (cast to LFWorkerData *). +int vp9_loop_filter_worker(void *arg1, void *unused); #ifdef __cplusplus } // extern "C" #endif diff --git a/libs/libvpx/vp9/common/vp9_onyxc_int.h b/libs/libvpx/vp9/common/vp9_onyxc_int.h index 32db7b7aa7..1d96d92c24 100644 --- a/libs/libvpx/vp9/common/vp9_onyxc_int.h +++ b/libs/libvpx/vp9/common/vp9_onyxc_int.h @@ -37,13 +37,10 @@ extern "C" { #define REF_FRAMES_LOG2 3 #define REF_FRAMES (1 << REF_FRAMES_LOG2) -// 4 scratch frames for the new frames to support a maximum of 4 cores decoding -// in parallel, 3 for scaled references on the encoder. -// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number -// of framebuffers. +// 1 scratch frame for the new frame, 3 for scaled references on the encoder. // TODO(jkoleszar): These 3 extra references could probably come from the // normal reference pool. 
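
vp9_loop_filter_worker above now takes untyped pointers and casts them back inside the function, which lets it be assigned directly to a worker hook; the later hunks delete the matching (VPxWorkerHook) casts at the assignment sites. A minimal sketch of that convention follows, using stand-in types rather than the real VPxWorker/LFWorkerData structures; the return convention is also simplified.

#include <stdio.h>

typedef int (*WorkerHook)(void *arg1, void *arg2);

typedef struct {
  int start_row;
  int stop_row;
} JobData;  /* stand-in for LFWorkerData */

static int loop_filter_hook(void *arg1, void *arg2) {
  const JobData *const job = (const JobData *)arg1;  /* cast happens here */
  (void)arg2;                                        /* unused, as above */
  printf("filtering rows [%d, %d)\n", job->start_row, job->stop_row);
  return 1;  /* nonzero on success in this sketch */
}

int main(void) {
  JobData job = { 0, 64 };
  WorkerHook hook = loop_filter_hook;  /* no function-pointer cast needed */
  return hook(&job, NULL) ? 0 : 1;
}

Calling a function through a pointer of a different type is undefined behavior, so moving the cast from the function pointer to the void * arguments keeps every hook invocation type-correct.
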
-#define FRAME_BUFFERS (REF_FRAMES + 7) +#define FRAME_BUFFERS (REF_FRAMES + 4) #define FRAME_CONTEXTS_LOG2 2 #define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2) @@ -72,30 +69,12 @@ typedef struct { MV_REF *mvs; int mi_rows; int mi_cols; + uint8_t released; vpx_codec_frame_buffer_t raw_frame_buffer; YV12_BUFFER_CONFIG buf; - - // The Following variables will only be used in frame parallel decode. - - // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means - // that no FrameWorker owns, or is decoding, this buffer. - VPxWorker *frame_worker_owner; - - // row and col indicate which position frame has been decoded to in real - // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX - // when the frame is fully decoded. - int row; - int col; } RefCntBuffer; typedef struct BufferPool { -// Protect BufferPool from being accessed by several FrameWorkers at -// the same time during frame parallel decode. -// TODO(hkuang): Try to use atomic variable instead of locking the whole pool. -#if CONFIG_MULTITHREAD - pthread_mutex_t pool_mutex; -#endif - // Private data associated with the frame buffer callbacks. void *cb_priv; @@ -235,10 +214,6 @@ typedef struct VP9Common { struct loopfilter lf; struct segmentation seg; - // TODO(hkuang): Remove this as it is the same as frame_parallel_decode - // in pbi. - int frame_parallel_decode; // frame-based threading. - // Context probabilities for reference frame prediction MV_REFERENCE_FRAME comp_fixed_ref; MV_REFERENCE_FRAME comp_var_ref[2]; @@ -283,11 +258,6 @@ typedef struct VP9Common { int above_context_alloc_cols; } VP9_COMMON; -// TODO(hkuang): Don't need to lock the whole pool after implementing atomic -// frame reference count. -void lock_buffer_pool(BufferPool *const pool); -void unlock_buffer_pool(BufferPool *const pool); - static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) { if (index < 0 || index >= REF_FRAMES) return NULL; if (cm->ref_frame_map[index] < 0) return NULL; @@ -303,7 +273,6 @@ static INLINE int get_free_fb(VP9_COMMON *cm) { RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; int i; - lock_buffer_pool(cm->buffer_pool); for (i = 0; i < FRAME_BUFFERS; ++i) if (frame_bufs[i].ref_count == 0) break; @@ -314,7 +283,6 @@ static INLINE int get_free_fb(VP9_COMMON *cm) { i = INVALID_IDX; } - unlock_buffer_pool(cm->buffer_pool); return i; } @@ -342,7 +310,7 @@ static INLINE void set_partition_probs(const VP9_COMMON *const cm, xd->partition_probs = frame_is_intra_only(cm) ? &vp9_kf_partition_probs[0] - : (const vpx_prob(*)[PARTITION_TYPES - 1])cm->fc->partition_prob; + : (const vpx_prob(*)[PARTITION_TYPES - 1]) cm->fc->partition_prob; } static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd, diff --git a/libs/libvpx/vp9/common/vp9_postproc.c b/libs/libvpx/vp9/common/vp9_postproc.c index b105e5d45a..dfc315eeac 100644 --- a/libs/libvpx/vp9/common/vp9_postproc.c +++ b/libs/libvpx/vp9/common/vp9_postproc.c @@ -380,7 +380,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, // if mfqe is enabled. Need to take both the quality and the speed // into consideration. 
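
With frame-parallel decode removed, the buffer pool is only touched from one thread, so get_free_fb in the hunk above drops its lock_buffer_pool/unlock_buffer_pool pair and becomes a plain linear scan. A self-contained sketch of that scan, where the struct is a stand-in and INVALID_IDX is assumed to be -1:

#include <stdio.h>

#define REF_FRAMES 8                    /* 1 << REF_FRAMES_LOG2, as above */
#define FRAME_BUFFERS (REF_FRAMES + 4)  /* 1 new frame + 3 scaled refs */
#define INVALID_IDX (-1)                /* assumed value */

typedef struct { int ref_count; } FrameBuf;  /* stand-in for RefCntBuffer */

static int get_free_fb(FrameBuf *bufs) {
  int i;
  for (i = 0; i < FRAME_BUFFERS; ++i)
    if (bufs[i].ref_count == 0) break;
  if (i == FRAME_BUFFERS) return INVALID_IDX;
  bufs[i].ref_count = 1;  /* caller now owns the buffer */
  return i;
}

int main(void) {
  FrameBuf pool[FRAME_BUFFERS] = { { 1 }, { 0 } };  /* rest zero-initialized */
  printf("first free buffer: %d\n", get_free_fb(pool));  /* prints 1 */
  return 0;
}

The pool itself also shrinks in the hunk above: FRAME_BUFFERS goes from REF_FRAMES + 7 to REF_FRAMES + 4, i.e. from 15 to 12 buffers with the eight reference slots, since the extra scratch frames for parallel decoding cores are no longer needed.
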
if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) { - vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int); + vpx_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int); } if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) { deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf, @@ -390,7 +390,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q, cm->postproc_state.limits); } else { - vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf); + vpx_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf); } } else if (flags & VP9D_DEMACROBLOCK) { deblock_and_de_macro_block(cm->frame_to_show, ppbuf, @@ -399,7 +399,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, } else if (flags & VP9D_DEBLOCK) { vp9_deblock(cm->frame_to_show, ppbuf, q, cm->postproc_state.limits); } else { - vp8_yv12_copy_frame(cm->frame_to_show, ppbuf); + vpx_yv12_copy_frame(cm->frame_to_show, ppbuf); } ppstate->last_base_qindex = cm->base_qindex; diff --git a/libs/libvpx/vp9/common/vp9_ppflags.h b/libs/libvpx/vp9/common/vp9_ppflags.h index 6dcfa412be..b8b647bf18 100644 --- a/libs/libvpx/vp9/common/vp9_ppflags.h +++ b/libs/libvpx/vp9/common/vp9_ppflags.h @@ -20,14 +20,7 @@ enum { VP9D_DEBLOCK = 1 << 0, VP9D_DEMACROBLOCK = 1 << 1, VP9D_ADDNOISE = 1 << 2, - VP9D_DEBUG_TXT_FRAME_INFO = 1 << 3, - VP9D_DEBUG_TXT_MBLK_MODES = 1 << 4, - VP9D_DEBUG_TXT_DC_DIFF = 1 << 5, - VP9D_DEBUG_TXT_RATE_INFO = 1 << 6, - VP9D_DEBUG_DRAW_MV = 1 << 7, - VP9D_DEBUG_CLR_BLK_MODES = 1 << 8, - VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9, - VP9D_MFQE = 1 << 10 + VP9D_MFQE = 1 << 3 }; typedef struct { diff --git a/libs/libvpx/vp9/common/vp9_reconinter.c b/libs/libvpx/vp9/common/vp9_reconinter.c index 8eb7126898..a108a65153 100644 --- a/libs/libvpx/vp9/common/vp9_reconinter.c +++ b/libs/libvpx/vp9/common/vp9_reconinter.c @@ -21,7 +21,7 @@ #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_build_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, enum mv_precision precision, int x, int y, int bd) { @@ -190,7 +190,8 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, + highbd_inter_predictor(CONVERT_TO_SHORTPTR(pre), pre_buf->stride, + CONVERT_TO_SHORTPTR(dst), dst_buf->stride, subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); } else { diff --git a/libs/libvpx/vp9/common/vp9_reconinter.h b/libs/libvpx/vp9/common/vp9_reconinter.h index 4fed4f7f6e..bb9291a264 100644 --- a/libs/libvpx/vp9/common/vp9_reconinter.h +++ b/libs/libvpx/vp9/common/vp9_reconinter.h @@ -26,19 +26,19 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, int xs, int ys) { - sf->predict[subpel_x != 0][subpel_y != 0][ref]( - src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y], - ys, w, h); + sf->predict[subpel_x != 0][subpel_y != 0][ref](src, src_stride, dst, + dst_stride, kernel, subpel_x, + xs, subpel_y, ys, w, h); } #if CONFIG_VP9_HIGHBITDEPTH static INLINE void highbd_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const uint16_t *src, 
int src_stride, uint16_t *dst, int dst_stride, const int subpel_x, const int subpel_y, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) { sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref]( - src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y], - ys, w, h, bd); + src, src_stride, dst, dst_stride, kernel, subpel_x, xs, subpel_y, ys, w, + h, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -68,7 +68,7 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_build_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg, const InterpKernel *kernel, enum mv_precision precision, int x, int y, int bd); diff --git a/libs/libvpx/vp9/common/vp9_rtcd_defs.pl b/libs/libvpx/vp9/common/vp9_rtcd_defs.pl index 37a867323d..22b67ecace 100644 --- a/libs/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/libs/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + sub vp9_common_forward_decls() { print <frame_buffer, lf_data->cm, lf_data->planes, lf_data->start, lf_data->stop, lf_data->y_only, lf_sync); @@ -183,7 +184,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, VPxWorker *const worker = &workers[i]; LFWorkerData *const lf_data = &lf_sync->lfdata[i]; - worker->hook = (VPxWorkerHook)loop_filter_row_worker; + worker->hook = loop_filter_row_worker; worker->data1 = lf_sync; worker->data2 = lf_data; diff --git a/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index dcfc454aa0..6996260e26 100644 --- a/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -16,11 +16,10 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[2]; - const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8); + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8); switch (tx_type) { case 0: // DCT_DCT @@ -49,48 +48,23 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[0] = _mm_srai_epi16(in[0], 4); in[1] = _mm_srai_epi16(in[1], 4); - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi32(d0, - _mm_cvtsi32_si128(*(const int *)(dest + stride))); - d2 = _mm_unpacklo_epi32( - d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3))); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d0 = _mm_add_epi16(d0, in[0]); - d2 = _mm_add_epi16(d2, in[1]); - d0 = _mm_packus_epi16(d0, d2); - // store result[0] - *(int *)dest = _mm_cvtsi128_si32(d0); - // store result[1] - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - // 
store result[2] - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); - // store result[3] - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - } + recon_and_store4x4_sse2(in, dest, stride); } void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[8]; - const __m128i zero = _mm_setzero_si128(); const __m128i final_rounding = _mm_set1_epi16(1 << 4); // load input data - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 1); - in[2] = load_input_data(input + 8 * 2); - in[3] = load_input_data(input + 8 * 3); - in[4] = load_input_data(input + 8 * 4); - in[5] = load_input_data(input + 8 * 5); - in[6] = load_input_data(input + 8 * 6); - in[7] = load_input_data(input + 8 * 7); + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8 * 1); + in[2] = load_input_data8(input + 8 * 2); + in[3] = load_input_data8(input + 8 * 3); + in[4] = load_input_data8(input + 8 * 4); + in[5] = load_input_data8(input + 8 * 5); + in[6] = load_input_data8(input + 8 * 6); + in[7] = load_input_data8(input + 8 * 7); switch (tx_type) { case 0: // DCT_DCT @@ -131,14 +105,91 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[6] = _mm_srai_epi16(in[6], 5); in[7] = _mm_srai_epi16(in[7], 5); - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); +} + +static INLINE void load_buffer_8x16(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 16); + in[1] = load_input_data8(input + 1 * 16); + in[2] = load_input_data8(input + 2 * 16); + in[3] = load_input_data8(input + 3 * 16); + in[4] = load_input_data8(input + 4 * 16); + in[5] = load_input_data8(input + 5 * 16); + in[6] = load_input_data8(input + 6 * 16); + in[7] = load_input_data8(input + 7 * 16); + + in[8] = load_input_data8(input + 8 * 16); + in[9] = load_input_data8(input + 9 * 16); + in[10] = load_input_data8(input + 10 * 16); + in[11] = load_input_data8(input + 11 * 16); + in[12] = load_input_data8(input + 12 * 16); + in[13] = load_input_data8(input + 13 * 16); + in[14] = load_input_data8(input + 14 * 16); + in[15] = load_input_data8(input + 15 * 16); +} + +static INLINE void write_buffer_8x16(uint8_t *const dest, __m128i *const in, + const int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = 
_mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); + recon_and_store(dest + 8 * stride, in[8]); + recon_and_store(dest + 9 * stride, in[9]); + recon_and_store(dest + 10 * stride, in[10]); + recon_and_store(dest + 11 * stride, in[11]); + recon_and_store(dest + 12 * stride, in[12]); + recon_and_store(dest + 13 * stride, in[13]); + recon_and_store(dest + 14 * stride, in[14]); + recon_and_store(dest + 15 * stride, in[15]); } void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, diff --git a/libs/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm b/libs/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm index 30852049b4..ca0897ab9f 100644 --- a/libs/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm +++ b/libs/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm @@ -12,6 +12,8 @@ ; TODO(jackychen): Find a way to fix the duplicate. 
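
The SSE2 rewrites above replace the RECON_AND_STORE macro and the open-coded 4x4 reconstruction with calls to a shared recon_and_store() helper. The core operation is the same either way: widen a row of destination pixels to 16 bits, add the residual, and pack back with unsigned saturation. A small, self-contained illustration of that step; this local recon_and_store8 is not the vpx_dsp helper itself, only the same idea.

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

static void recon_and_store8(uint8_t *dest, const int16_t *residual) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i res = _mm_loadu_si128((const __m128i *)residual);
  __m128i d = _mm_loadl_epi64((const __m128i *)dest);  /* 8 pixels */
  d = _mm_unpacklo_epi8(d, zero);  /* widen u8 -> s16 */
  d = _mm_add_epi16(d, res);       /* add residual */
  d = _mm_packus_epi16(d, d);      /* clamp to [0, 255] */
  _mm_storel_epi64((__m128i *)dest, d);
}

int main(void) {
  uint8_t dest[8] = { 250, 10, 0, 128, 3, 200, 60, 255 };
  const int16_t res[8] = { 20, -20, -5, 1, 0, 100, 0, 0 };
  int i;
  recon_and_store8(dest, res);
  for (i = 0; i < 8; ++i) printf("%d ", dest[i]);  /* 255 0 0 129 3 255 60 255 */
  printf("\n");
  return 0;
}
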
%include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vp9_filter_by_weight16x16_sse2 ;( ; unsigned char *src, diff --git a/libs/libvpx/vp9/decoder/vp9_decodeframe.c b/libs/libvpx/vp9/decoder/vp9_decodeframe.c index fde0b7e318..d0e896c13f 100644 --- a/libs/libvpx/vp9/decoder/vp9_decodeframe.c +++ b/libs/libvpx/vp9/decoder/vp9_decodeframe.c @@ -189,21 +189,22 @@ static void inverse_transform_block_inter(MACROBLOCKD *xd, int plane, assert(eob > 0); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct4x4_add(dqcoeff, dst16, stride, eob, xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct8x8_add(dqcoeff, dst16, stride, eob, xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct16x16_add(dqcoeff, dst16, stride, eob, xd->bd); break; case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd); break; default: assert(0 && "Invalid transform size"); } @@ -256,21 +257,22 @@ static void inverse_transform_block_intra(MACROBLOCKD *xd, int plane, assert(eob > 0); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd); break; case TX_8X8: - vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd); break; case TX_16X16: - vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd); break; case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd); break; default: assert(0 && "Invalid transform size"); } @@ -451,24 +453,19 @@ static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride, const struct scale_factors *sf, MACROBLOCKD *xd, int w, int h, int ref, int xs, int ys) { DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]); - const uint8_t *buf_ptr; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { high_build_mc_border(buf_ptr1, pre_buf_stride, mc_buf_high, b_w, x0, y0, b_w, b_h, frame_width, frame_height); - buf_ptr = CONVERT_TO_BYTEPTR(mc_buf_high) + border_offset; + highbd_inter_predictor(mc_buf_high + border_offset, b_w, + CONVERT_TO_SHORTPTR(dst), dst_buf_stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); } else { build_mc_border(buf_ptr1, pre_buf_stride, (uint8_t *)mc_buf_high, b_w, x0, y0, b_w, b_h, frame_width, frame_height); - buf_ptr = ((uint8_t *)mc_buf_high) + border_offset; - } - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x, - subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); - } else { - 
inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x, subpel_y, sf, - w, h, ref, kernel, xs, ys); + inter_predictor(((uint8_t *)mc_buf_high) + border_offset, b_w, dst, + dst_buf_stride, subpel_x, subpel_y, sf, w, h, ref, kernel, + xs, ys); } } #else @@ -493,8 +490,8 @@ static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride, #endif // CONFIG_VP9_HIGHBITDEPTH static void dec_build_inter_predictors( - VPxWorker *const worker, MACROBLOCKD *xd, int plane, int bw, int bh, int x, - int y, int w, int h, int mi_x, int mi_y, const InterpKernel *kernel, + MACROBLOCKD *xd, int plane, int bw, int bh, int x, int y, int w, int h, + int mi_x, int mi_y, const InterpKernel *kernel, const struct scale_factors *sf, struct buf_2d *pre_buf, struct buf_2d *dst_buf, const MV *mv, RefCntBuffer *ref_frame_buf, int is_scaled, int ref) { @@ -596,12 +593,6 @@ static void dec_build_inter_predictors( y_pad = 1; } - // Wait until reference block is ready. Pad 7 more pixels as last 7 - // pixels of each superblock row can be changed by next superblock row. - if (worker != NULL) - vp9_frameworker_wait(worker, ref_frame_buf, VPXMAX(0, (y1 + 7)) - << (plane == 0 ? 0 : 1)); - // Skip border extension if block is inside the frame. if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 || y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) { @@ -620,18 +611,11 @@ static void dec_build_inter_predictors( w, h, ref, xs, ys); return; } - } else { - // Wait until reference block is ready. Pad 7 more pixels as last 7 - // pixels of each superblock row can be changed by next superblock row. - if (worker != NULL) { - const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS; - vp9_frameworker_wait(worker, ref_frame_buf, VPXMAX(0, (y1 + 7)) - << (plane == 0 ? 0 : 1)); - } } #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, + highbd_inter_predictor(CONVERT_TO_SHORTPTR(buf_ptr), buf_stride, + CONVERT_TO_SHORTPTR(dst), dst_buf->stride, subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); } else { inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, @@ -655,8 +639,6 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, const int is_compound = has_second_ref(mi); int ref; int is_scaled; - VPxWorker *const fwo = - pbi->frame_parallel_decode ? 
pbi->frame_worker_owner : NULL; for (ref = 0; ref < 1 + is_compound; ++ref) { const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; @@ -688,10 +670,10 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, for (y = 0; y < num_4x4_h; ++y) { for (x = 0; x < num_4x4_w; ++x) { const MV mv = average_split_mvs(pd, mi, ref, i++); - dec_build_inter_predictors(fwo, xd, plane, n4w_x4, n4h_x4, 4 * x, - 4 * y, 4, 4, mi_x, mi_y, kernel, sf, - pre_buf, dst_buf, &mv, ref_frame_buf, - is_scaled, ref); + dec_build_inter_predictors(xd, plane, n4w_x4, n4h_x4, 4 * x, 4 * y, + 4, 4, mi_x, mi_y, kernel, sf, pre_buf, + dst_buf, &mv, ref_frame_buf, is_scaled, + ref); } } } @@ -705,7 +687,7 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, const int n4w_x4 = 4 * num_4x4_w; const int n4h_x4 = 4 * num_4x4_h; struct buf_2d *const pre_buf = &pd->pre[ref]; - dec_build_inter_predictors(fwo, xd, plane, n4w_x4, n4h_x4, 0, 0, n4w_x4, + dec_build_inter_predictors(xd, plane, n4w_x4, n4h_x4, 0, 0, n4w_x4, n4h_x4, mi_x, mi_y, kernel, sf, pre_buf, dst_buf, &mv, ref_frame_buf, is_scaled, ref); } @@ -838,8 +820,9 @@ static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); const int max_blocks_high = num_4x4_h + - (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> - (5 + pd->subsampling_y)); + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide; xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high; @@ -1188,7 +1171,6 @@ static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) { resize_context_buffers(cm, width, height); setup_render_size(cm, rb); - lock_buffer_pool(pool); if (vpx_realloc_frame_buffer( get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, @@ -1198,12 +1180,11 @@ static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) { VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment, &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { - unlock_buffer_pool(pool); vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } - unlock_buffer_pool(pool); + pool->frame_bufs[cm->new_fb_idx].released = 0; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; @@ -1274,7 +1255,6 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, resize_context_buffers(cm, width, height); setup_render_size(cm, rb); - lock_buffer_pool(pool); if (vpx_realloc_frame_buffer( get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, @@ -1284,12 +1264,11 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment, &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { - unlock_buffer_pool(pool); vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } - unlock_buffer_pool(pool); + pool->frame_bufs[cm->new_fb_idx].released = 0; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; @@ -1385,7 +1364,7 @@ static const uint8_t 
*decode_tiles(VP9Decoder *pbi, const uint8_t *data, pbi->lf_worker.data1 == NULL) { CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, vpx_memalign(32, sizeof(LFWorkerData))); - pbi->lf_worker.hook = (VPxWorkerHook)vp9_loop_filter_worker; + pbi->lf_worker.hook = vp9_loop_filter_worker; if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) { vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Loop filter thread creation failed"); @@ -1474,11 +1453,6 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, winterface->execute(&pbi->lf_worker); } } - // After loopfiltering, the last 7 row pixels in each superblock row may - // still be changed by the longest loopfilter of the next superblock - // row. - if (pbi->frame_parallel_decode) - vp9_frameworker_broadcast(pbi->cur_buf, mi_row << MI_BLOCK_SIZE_LOG2); } } @@ -1494,16 +1468,16 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, // Get last tile data. tile_data = pbi->tile_worker_data + tile_cols * tile_rows - 1; - if (pbi->frame_parallel_decode) - vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX); return vpx_reader_find_end(&tile_data->bit_reader); } // On entry 'tile_data->data_end' points to the end of the input frame, on exit // it is updated to reflect the bitreader position of the final tile column if // present in the tile buffer group or NULL otherwise. -static int tile_worker_hook(TileWorkerData *const tile_data, - VP9Decoder *const pbi) { +static int tile_worker_hook(void *arg1, void *arg2) { + TileWorkerData *const tile_data = (TileWorkerData *)arg1; + VP9Decoder *const pbi = (VP9Decoder *)arg2; + TileInfo *volatile tile = &tile_data->xd.tile; const int final_col = (1 << pbi->common.log2_tile_cols) - 1; const uint8_t *volatile bit_reader_end = NULL; @@ -1517,7 +1491,6 @@ static int tile_worker_hook(TileWorkerData *const tile_data, return 0; } - tile_data->xd.error_info = &tile_data->error_info; tile_data->xd.corrupted = 0; do { @@ -1529,6 +1502,8 @@ static int tile_worker_hook(TileWorkerData *const tile_data, &tile_data->error_info, &tile_data->bit_reader, pbi->decrypt_cb, pbi->decrypt_state); vp9_init_macroblockd(&pbi->common, &tile_data->xd, tile_data->dqcoeff); + // init resets xd.error_info + tile_data->xd.error_info = &tile_data->error_info; for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; mi_row += MI_BLOCK_SIZE) { @@ -1596,7 +1571,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, tile_data->xd = pbi->mb; tile_data->xd.counts = cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts; - worker->hook = (VPxWorkerHook)tile_worker_hook; + worker->hook = tile_worker_hook; worker->data1 = tile_data; worker->data2 = pbi; } @@ -1779,24 +1754,17 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, if (cm->show_existing_frame) { // Show an existing frame directly. 
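
tile_worker_hook above now matches the generic void * hook signature, and it re-installs xd.error_info after vp9_init_macroblockd because, as the added comment notes, the init call resets that pointer; without the re-init, an error raised inside the tile would be reported through whatever error_info the init installed rather than this worker's own. A simplified sketch of the setjmp-based error flow, with stand-in types rather than the real TileWorkerData/vpx_internal_error machinery:

#include <setjmp.h>
#include <stdio.h>

typedef struct { jmp_buf jmp; int code; } ErrorInfo;
typedef struct { ErrorInfo error_info; ErrorInfo *active; int corrupted; } TileData;

static void internal_error(ErrorInfo *e, int code) {
  e->code = code;
  longjmp(e->jmp, 1);  /* unwind back to the worker's setjmp */
}

static void decode_tile(TileData *t, int fail) {
  if (fail) internal_error(t->active, 7);  /* e.g. a corrupt bitstream */
}

static int tile_worker_hook(TileData *t, int fail) {
  if (setjmp(t->error_info.jmp)) {
    t->corrupted = 1;
    return 0;
  }
  t->active = &t->error_info;  /* must point at this worker's own error_info */
  t->corrupted = 0;
  decode_tile(t, fail);
  return !t->corrupted;
}

int main(void) {
  TileData t;
  printf("ok tile: %d\n", tile_worker_hook(&t, 0));   /* 1 */
  printf("bad tile: %d\n", tile_worker_hook(&t, 1));  /* 0 */
  return 0;
}
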
const int frame_to_show = cm->ref_frame_map[vpx_rb_read_literal(rb, 3)]; - lock_buffer_pool(pool); if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) { - unlock_buffer_pool(pool); vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Buffer %d does not contain a decoded frame", frame_to_show); } ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show); - unlock_buffer_pool(pool); pbi->refresh_frame_flags = 0; cm->lf.filter_level = 0; cm->show_frame = 1; - if (pbi->frame_parallel_decode) { - for (i = 0; i < REF_FRAMES; ++i) - cm->next_ref_frame_map[i] = cm->ref_frame_map[i]; - } return 0; } @@ -1913,7 +1881,6 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, cm->frame_context_idx = vpx_rb_read_literal(rb, FRAME_CONTEXTS_LOG2); // Generate next_ref_frame_map. - lock_buffer_pool(pool); for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { if (mask & 1) { cm->next_ref_frame_map[ref_index] = cm->new_fb_idx; @@ -1933,7 +1900,6 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, if (cm->ref_frame_map[ref_index] >= 0) ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count; } - unlock_buffer_pool(pool); pbi->hold_ref_buf = 1; if (frame_is_intra_only(cm) || cm->error_resilient_mode) @@ -2090,24 +2056,6 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, vp9_loop_filter_frame_init(cm, cm->lf.filter_level); } - // If encoded in frame parallel mode, frame context is ready after decoding - // the frame header. - if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) { - VPxWorker *const worker = pbi->frame_worker_owner; - FrameWorkerData *const frame_worker_data = worker->data1; - if (cm->refresh_frame_context) { - context_updated = 1; - cm->frame_contexts[cm->frame_context_idx] = *cm->fc; - } - vp9_frameworker_lock_stats(worker); - pbi->cur_buf->row = -1; - pbi->cur_buf->col = -1; - frame_worker_data->frame_context_ready = 1; - // Signal the main thread that context is ready. - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); - } - if (pbi->tile_worker_data == NULL || (tile_cols * tile_rows) != pbi->total_tiles) { const int num_tile_workers = diff --git a/libs/libvpx/vp9/decoder/vp9_decodemv.c b/libs/libvpx/vp9/decoder/vp9_decodemv.c index 4372ba0371..0a781413b1 100644 --- a/libs/libvpx/vp9/decoder/vp9_decodemv.c +++ b/libs/libvpx/vp9/decoder/vp9_decodemv.c @@ -455,12 +455,6 @@ static void dec_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *best_mv, } } -static void fpm_sync(void *const data, int mi_row) { - VP9Decoder *const pbi = (VP9Decoder *)data; - vp9_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame, - mi_row << MI_BLOCK_SIZE_LOG2); -} - // This macro is used to add a motion vector mv_ref list if it isn't // already in the list. If it's the second motion vector or early_break // it will also skip all additional processing and jump to Done! @@ -500,8 +494,7 @@ static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, const POSITION *const mv_ref_search, int_mv *mv_ref_list, int mi_row, int mi_col, - int block, int is_sub8x8, find_mv_refs_sync sync, - void *const data) { + int block, int is_sub8x8) { const int *ref_sign_bias = cm->ref_frame_sign_bias; int i, refmv_count = 0; int different_ref_found = 0; @@ -557,23 +550,8 @@ static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, } } -// TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast -// on windows platform. 
The sync here is unnecessary if use_prev_frame_mvs -// is 0. But after removing it, there will be hang in the unit test on windows -// due to several threads waiting for a thread's signal. -#if defined(_WIN32) && !HAVE_PTHREAD_H - if (cm->frame_parallel_decode && sync != NULL) { - sync(data, mi_row); - } -#endif - // Check the last frame's mode and mv info. if (prev_frame_mvs) { - // Synchronize here for frame parallel decode if sync function is provided. - if (cm->frame_parallel_decode && sync != NULL) { - sync(data, mi_row); - } - if (prev_frame_mvs->ref_frame[0] == ref_frame) { ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done); } else if (prev_frame_mvs->ref_frame[1] == ref_frame) { @@ -652,7 +630,7 @@ static void append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, refmv_count = dec_find_mv_refs(cm, xd, b_mode, mi->ref_frame[ref], mv_ref_search, - mv_list, mi_row, mi_col, block, 1, NULL, NULL); + mv_list, mi_row, mi_col, block, 1); switch (block) { case 0: best_sub8x8->as_int = mv_list[refmv_count - 1].as_int; break; @@ -750,9 +728,8 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; int refmv_count; - refmv_count = - dec_find_mv_refs(cm, xd, mi->mode, frame, mv_ref_search, tmp_mvs, - mi_row, mi_col, -1, 0, fpm_sync, (void *)pbi); + refmv_count = dec_find_mv_refs(cm, xd, mi->mode, frame, mv_ref_search, + tmp_mvs, mi_row, mi_col, -1, 0); dec_find_best_ref_mvs(allow_hp, tmp_mvs, &best_ref_mvs[ref], refmv_count); @@ -770,6 +747,10 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, int idx, idy; PREDICTION_MODE b_mode; int_mv best_sub8x8[2]; + const uint32_t invalid_mv = 0x80008000; + // Initialize the 2nd element as even though it won't be used meaningfully + // if is_compound is false, copying/clamping it may trigger a MSan warning. + best_sub8x8[1].as_int = invalid_mv; for (idy = 0; idy < 2; idy += num_4x4_h) { for (idx = 0; idx < 2; idx += num_4x4_w) { const int j = idy * 2 + idx; diff --git a/libs/libvpx/vp9/decoder/vp9_decoder.c b/libs/libvpx/vp9/decoder/vp9_decoder.c index 37693f0944..a913fa560c 100644 --- a/libs/libvpx/vp9/decoder/vp9_decoder.c +++ b/libs/libvpx/vp9/decoder/vp9_decoder.c @@ -139,6 +139,7 @@ void vp9_decoder_remove(VP9Decoder *pbi) { vp9_loop_filter_dealloc(&pbi->lf_row_sync); } + vp9_remove_common(&pbi->common); vpx_free(pbi); } @@ -169,7 +170,7 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi, vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Incorrect buffer dimensions"); else - vp8_yv12_copy_frame(cfg, sd); + vpx_yv12_copy_frame(cfg, sd); } else { vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Invalid reference frame"); } @@ -217,7 +218,7 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, "Incorrect buffer dimensions"); } else { // Overwrite the reference frame buffer. - vp8_yv12_copy_frame(sd, ref_buf); + vpx_yv12_copy_frame(sd, ref_buf); } return cm->error.error_code; @@ -230,7 +231,6 @@ static void swap_frame_buffers(VP9Decoder *pbi) { BufferPool *const pool = cm->buffer_pool; RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - lock_buffer_pool(pool); for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { const int old_idx = cm->ref_frame_map[ref_index]; // Current thread releases the holding of reference frame. 
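
The read_inter_block_mode_info hunk above pre-fills best_sub8x8[1] with 0x80008000 so that MSan does not flag the copy/clamp of an uninitialized value when the block is not compound-predicted. The constant is simply INT16_MIN in both 16-bit halves of the packed motion vector, a value no legal component can take. A tiny illustration; the union below mirrors libvpx's int_mv layout in spirit, and the exact field order is an assumption here (both halves are identical, so it does not affect the output).

#include <stdint.h>
#include <stdio.h>

typedef struct { int16_t row, col; } MV;              /* illustrative */
typedef union { uint32_t as_int; MV as_mv; } int_mv;  /* illustrative */

int main(void) {
  int_mv mv;
  mv.as_int = 0x80008000;
  printf("row=%d col=%d\n", mv.as_mv.row, mv.as_mv.col);  /* both -32768 */
  return 0;
}
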
@@ -250,15 +250,10 @@ static void swap_frame_buffers(VP9Decoder *pbi) { decrease_ref_count(old_idx, frame_bufs, pool); cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index]; } - unlock_buffer_pool(pool); pbi->hold_ref_buf = 0; cm->frame_to_show = get_frame_new_buffer(cm); - if (!pbi->frame_parallel_decode || !cm->show_frame) { - lock_buffer_pool(pool); - --frame_bufs[cm->new_fb_idx].ref_count; - unlock_buffer_pool(pool); - } + --frame_bufs[cm->new_fb_idx].ref_count; // Invalidate these references until the next frame starts. for (ref_index = 0; ref_index < 3; ref_index++) @@ -292,11 +287,13 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, pbi->ready_for_new_data = 0; // Check if the previous frame was a frame without any references to it. - // Release frame buffer if not decoding in frame parallel mode. - if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0 && - frame_bufs[cm->new_fb_idx].ref_count == 0) + if (cm->new_fb_idx >= 0 && frame_bufs[cm->new_fb_idx].ref_count == 0 && + !frame_bufs[cm->new_fb_idx].released) { pool->release_fb_cb(pool->cb_priv, &frame_bufs[cm->new_fb_idx].raw_frame_buffer); + frame_bufs[cm->new_fb_idx].released = 1; + } + // Find a free frame buffer. Return error if can not find any. cm->new_fb_idx = get_free_fb(cm); if (cm->new_fb_idx == INVALID_IDX) { @@ -309,18 +306,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; pbi->hold_ref_buf = 0; - if (pbi->frame_parallel_decode) { - VPxWorker *const worker = pbi->frame_worker_owner; - vp9_frameworker_lock_stats(worker); - frame_bufs[cm->new_fb_idx].frame_worker_owner = worker; - // Reset decoding progress. - pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; - pbi->cur_buf->row = -1; - pbi->cur_buf->col = -1; - vp9_frameworker_unlock_stats(worker); - } else { - pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; - } + pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; if (setjmp(cm->error.jmp)) { const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); @@ -336,7 +322,6 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, winterface->sync(&pbi->tile_workers[i]); } - lock_buffer_pool(pool); // Release all the reference buffers if worker thread is holding them. if (pbi->hold_ref_buf == 1) { int ref_index = 0, mask; @@ -361,7 +346,6 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, } // Release current frame. decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); - unlock_buffer_pool(pool); vpx_clear_system_state(); return -1; @@ -377,31 +361,14 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, if (!cm->show_existing_frame) { cm->last_show_frame = cm->show_frame; cm->prev_frame = cm->cur_frame; - if (cm->seg.enabled && !pbi->frame_parallel_decode) - vp9_swap_current_and_last_seg_map(cm); + if (cm->seg.enabled) vp9_swap_current_and_last_seg_map(cm); } // Update progress in frame parallel decode. - if (pbi->frame_parallel_decode) { - // Need to lock the mutex here as another thread may - // be accessing this buffer. 
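
The swap_frame_buffers and vp9_receive_compressed_data changes around here pair the ref_count bookkeeping with the new released flag on RefCntBuffer, so release_fb_cb is invoked at most once per buffer even when several code paths (error handling, buffer swap, the pre-decode cleanup) see the count reach zero. A minimal sketch of that guard, with an illustrative struct and callback rather than the real pool types:

#include <stdio.h>

typedef struct {
  int ref_count;
  int released;  /* set once the external frame-buffer callback has run */
} Buf;

static void release_cb(Buf *b) { (void)b; printf("release_fb_cb called\n"); }

static void decrease_ref_count(Buf *b) {
  if (b->ref_count > 0) --b->ref_count;
  /* Hand the buffer back to the application exactly once, even if the
   * count drops to zero on more than one path. */
  if (!b->released && b->ref_count == 0) {
    release_cb(b);
    b->released = 1;
  }
}

int main(void) {
  Buf b = { 2, 0 };
  decrease_ref_count(&b);  /* 2 -> 1, nothing released */
  decrease_ref_count(&b);  /* 1 -> 0, release_cb fires once */
  decrease_ref_count(&b);  /* already released, no double release */
  return 0;
}
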
- VPxWorker *const worker = pbi->frame_worker_owner; - FrameWorkerData *const frame_worker_data = worker->data1; - vp9_frameworker_lock_stats(worker); - - if (cm->show_frame) { - cm->current_video_frame++; - } - frame_worker_data->frame_decoded = 1; - frame_worker_data->frame_context_ready = 1; - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); - } else { - cm->last_width = cm->width; - cm->last_height = cm->height; - if (cm->show_frame) { - cm->current_video_frame++; - } + cm->last_width = cm->width; + cm->last_height = cm->height; + if (cm->show_frame) { + cm->current_video_frame++; } cm->error.setjmp = 0; diff --git a/libs/libvpx/vp9/decoder/vp9_decoder.h b/libs/libvpx/vp9/decoder/vp9_decoder.h index 427baf1e0b..4b26c314d3 100644 --- a/libs/libvpx/vp9/decoder/vp9_decoder.h +++ b/libs/libvpx/vp9/decoder/vp9_decoder.h @@ -21,7 +21,6 @@ #include "vp9/common/vp9_thread_common.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_ppflags.h" -#include "vp9/decoder/vp9_dthread.h" #ifdef __cplusplus extern "C" { @@ -53,13 +52,10 @@ typedef struct VP9Decoder { int refresh_frame_flags; - int frame_parallel_decode; // frame-based threading. - // TODO(hkuang): Combine this with cur_buf in macroblockd as they are // the same. RefCntBuffer *cur_buf; // Current decoding frame buffer. - VPxWorker *frame_worker_owner; // frame_worker that owns this pbi. VPxWorker lf_worker; VPxWorker *tile_workers; TileWorkerData *tile_worker_data; @@ -121,9 +117,10 @@ static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, // But the private buffer is not set up until finish decoding header. // So any error happens during decoding header, the frame_bufs will not // have valid priv buffer. - if (frame_bufs[idx].ref_count == 0 && + if (!frame_bufs[idx].released && frame_bufs[idx].ref_count == 0 && frame_bufs[idx].raw_frame_buffer.priv) { pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer); + frame_bufs[idx].released = 1; } } } diff --git a/libs/libvpx/vp9/decoder/vp9_detokenize.c b/libs/libvpx/vp9/decoder/vp9_detokenize.c index 7048fb1ca0..4bd016dc7d 100644 --- a/libs/libvpx/vp9/decoder/vp9_detokenize.c +++ b/libs/libvpx/vp9/decoder/vp9_detokenize.c @@ -99,9 +99,10 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type, vp9_cat6_prob; const int cat6_bits = #if CONFIG_VP9_HIGHBITDEPTH - (xd->bd == VPX_BITS_12) ? 18 : (xd->bd == VPX_BITS_10) ? 16 : + (xd->bd == VPX_BITS_12) ? 18 + : (xd->bd == VPX_BITS_10) ? 16 : #endif // CONFIG_VP9_HIGHBITDEPTH - 14; + 14; // Keep value, range, and count as locals. The compiler produces better // results with the locals than using r directly. BD_VALUE value = r->value; @@ -170,7 +171,12 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type, read_coeff(r, vp9_cat1_prob, 1, &value, &count, &range); } } +#if CONFIG_VP9_HIGHBITDEPTH + // val may use 18-bits + v = (int)(((int64_t)val * dqv) >> dq_shift); +#else v = (val * dqv) >> dq_shift; +#endif } else { if (read_bool(r, p[1], &value, &count, &range)) { token_cache[scan[c]] = 3; @@ -188,9 +194,8 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type, } #if CONFIG_COEFFICIENT_RANGE_CHECKING #if CONFIG_VP9_HIGHBITDEPTH - dqcoeff[scan[c]] = - highbd_check_range(read_bool(r, 128, &value, &count, &range) ? -v : v), - xd->bd); + dqcoeff[scan[c]] = highbd_check_range( + read_bool(r, 128, &value, &count, &range) ? -v : v, xd->bd); #else dqcoeff[scan[c]] = check_range(read_bool(r, 128, &value, &count, &range) ? 
-v : v); diff --git a/libs/libvpx/vp9/decoder/vp9_dthread.c b/libs/libvpx/vp9/decoder/vp9_dthread.c deleted file mode 100644 index 52bc2a0f60..0000000000 --- a/libs/libvpx/vp9/decoder/vp9_dthread.c +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_config.h" -#include "vpx_mem/vpx_mem.h" -#include "vp9/common/vp9_reconinter.h" -#include "vp9/decoder/vp9_dthread.h" -#include "vp9/decoder/vp9_decoder.h" - -// #define DEBUG_THREAD - -// TODO(hkuang): Clean up all the #ifdef in this file. -void vp9_frameworker_lock_stats(VPxWorker *const worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const worker_data = worker->data1; - pthread_mutex_lock(&worker_data->stats_mutex); -#else - (void)worker; -#endif -} - -void vp9_frameworker_unlock_stats(VPxWorker *const worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const worker_data = worker->data1; - pthread_mutex_unlock(&worker_data->stats_mutex); -#else - (void)worker; -#endif -} - -void vp9_frameworker_signal_stats(VPxWorker *const worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const worker_data = worker->data1; - -// TODO(hkuang): Fix the pthread_cond_broadcast in windows wrapper. -#if defined(_WIN32) && !HAVE_PTHREAD_H - pthread_cond_signal(&worker_data->stats_cond); -#else - pthread_cond_broadcast(&worker_data->stats_cond); -#endif - -#else - (void)worker; -#endif -} - -// This macro prevents thread_sanitizer from reporting known concurrent writes. -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#define BUILDING_WITH_TSAN -#endif -#endif - -// TODO(hkuang): Remove worker parameter as it is only used in debug code. -void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf, - int row) { -#if CONFIG_MULTITHREAD - if (!ref_buf) return; - -#ifndef BUILDING_WITH_TSAN - // The following line of code will get harmless tsan error but it is the key - // to get best performance. - if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return; -#endif - - { - // Find the worker thread that owns the reference frame. If the reference - // frame has been fully decoded, it may not have owner. 
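
The vp9_dthread.c file being deleted here implemented frame-parallel synchronization as a condvar-guarded progress counter: a decoder blocked in vp9_frameworker_wait until the owning worker's row counter, advanced by vp9_frameworker_broadcast, reached the rows it needed. A stripped-down sketch of that pattern, with made-up types and none of the ownership or corruption checks of the original:

#include <pthread.h>
#include <stdio.h>

typedef struct {
  pthread_mutex_t mutex;
  pthread_cond_t cond;
  int row;  /* last fully decoded row, -1 before decoding starts */
} Progress;

static void progress_wait(Progress *p, int row) {
  pthread_mutex_lock(&p->mutex);
  while (p->row < row) pthread_cond_wait(&p->cond, &p->mutex);
  pthread_mutex_unlock(&p->mutex);
}

static void progress_broadcast(Progress *p, int row) {
  pthread_mutex_lock(&p->mutex);
  p->row = row;
  pthread_cond_broadcast(&p->cond);
  pthread_mutex_unlock(&p->mutex);
}

static void *producer(void *arg) {
  Progress *p = (Progress *)arg;
  int row;
  for (row = 0; row <= 64; row += 8) progress_broadcast(p, row);
  return NULL;
}

int main(void) {
  Progress p = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, -1 };
  pthread_t thread;
  pthread_create(&thread, NULL, producer, &p);
  progress_wait(&p, 32);  /* returns once at least rows [0, 32] are done */
  printf("reference rows ready\n");
  pthread_join(thread, NULL);
  return 0;
}
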
- VPxWorker *const ref_worker = ref_buf->frame_worker_owner; - FrameWorkerData *const ref_worker_data = - (FrameWorkerData *)ref_worker->data1; - const VP9Decoder *const pbi = ref_worker_data->pbi; - -#ifdef DEBUG_THREAD - { - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - printf("%d %p worker is waiting for %d %p worker (%d) ref %d \r\n", - worker_data->worker_id, worker, ref_worker_data->worker_id, - ref_buf->frame_worker_owner, row, ref_buf->row); - } -#endif - - vp9_frameworker_lock_stats(ref_worker); - while (ref_buf->row < row && pbi->cur_buf == ref_buf && - ref_buf->buf.corrupted != 1) { - pthread_cond_wait(&ref_worker_data->stats_cond, - &ref_worker_data->stats_mutex); - } - - if (ref_buf->buf.corrupted == 1) { - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - vp9_frameworker_unlock_stats(ref_worker); - vpx_internal_error(&worker_data->pbi->common.error, - VPX_CODEC_CORRUPT_FRAME, - "Worker %p failed to decode frame", worker); - } - vp9_frameworker_unlock_stats(ref_worker); - } -#else - (void)worker; - (void)ref_buf; - (void)row; - (void)ref_buf; -#endif // CONFIG_MULTITHREAD -} - -void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) { -#if CONFIG_MULTITHREAD - VPxWorker *worker = buf->frame_worker_owner; - -#ifdef DEBUG_THREAD - { - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - printf("%d %p worker decode to (%d) \r\n", worker_data->worker_id, - buf->frame_worker_owner, row); - } -#endif - - vp9_frameworker_lock_stats(worker); - buf->row = row; - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); -#else - (void)buf; - (void)row; -#endif // CONFIG_MULTITHREAD -} - -void vp9_frameworker_copy_context(VPxWorker *const dst_worker, - VPxWorker *const src_worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1; - FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1; - VP9_COMMON *const src_cm = &src_worker_data->pbi->common; - VP9_COMMON *const dst_cm = &dst_worker_data->pbi->common; - int i; - - // Wait until source frame's context is ready. - vp9_frameworker_lock_stats(src_worker); - while (!src_worker_data->frame_context_ready) { - pthread_cond_wait(&src_worker_data->stats_cond, - &src_worker_data->stats_mutex); - } - - dst_cm->last_frame_seg_map = src_cm->seg.enabled - ? src_cm->current_frame_seg_map - : src_cm->last_frame_seg_map; - dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync; - vp9_frameworker_unlock_stats(src_worker); - - dst_cm->bit_depth = src_cm->bit_depth; -#if CONFIG_VP9_HIGHBITDEPTH - dst_cm->use_highbitdepth = src_cm->use_highbitdepth; -#endif - dst_cm->prev_frame = - src_cm->show_existing_frame ? src_cm->prev_frame : src_cm->cur_frame; - dst_cm->last_width = - !src_cm->show_existing_frame ? src_cm->width : src_cm->last_width; - dst_cm->last_height = - !src_cm->show_existing_frame ? src_cm->height : src_cm->last_height; - dst_cm->subsampling_x = src_cm->subsampling_x; - dst_cm->subsampling_y = src_cm->subsampling_y; - dst_cm->frame_type = src_cm->frame_type; - dst_cm->last_show_frame = !src_cm->show_existing_frame - ? 
src_cm->show_frame - : src_cm->last_show_frame; - for (i = 0; i < REF_FRAMES; ++i) - dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i]; - - memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr, - (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh)); - dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level; - dst_cm->lf.filter_level = src_cm->lf.filter_level; - memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, MAX_REF_LF_DELTAS); - memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS); - dst_cm->seg = src_cm->seg; - memcpy(dst_cm->frame_contexts, src_cm->frame_contexts, - FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0])); -#else - (void)dst_worker; - (void)src_worker; -#endif // CONFIG_MULTITHREAD -} diff --git a/libs/libvpx/vp9/decoder/vp9_dthread.h b/libs/libvpx/vp9/decoder/vp9_dthread.h deleted file mode 100644 index fce0fe7fe3..0000000000 --- a/libs/libvpx/vp9/decoder/vp9_dthread.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_DECODER_VP9_DTHREAD_H_ -#define VP9_DECODER_VP9_DTHREAD_H_ - -#include "./vpx_config.h" -#include "vpx_util/vpx_thread.h" -#include "vpx/internal/vpx_codec_internal.h" - -#ifdef __cplusplus -extern "C" { -#endif - -struct VP9Common; -struct VP9Decoder; - -// WorkerData for the FrameWorker thread. It contains all the information of -// the worker and decode structures for decoding a frame. -typedef struct FrameWorkerData { - struct VP9Decoder *pbi; - const uint8_t *data; - const uint8_t *data_end; - size_t data_size; - void *user_priv; - int result; - int worker_id; - int received_frame; - - // scratch_buffer is used in frame parallel mode only. - // It is used to make a copy of the compressed data. - uint8_t *scratch_buffer; - size_t scratch_buffer_size; - -#if CONFIG_MULTITHREAD - pthread_mutex_t stats_mutex; - pthread_cond_t stats_cond; -#endif - - int frame_context_ready; // Current frame's context is ready to read. - int frame_decoded; // Finished decoding current frame. -} FrameWorkerData; - -void vp9_frameworker_lock_stats(VPxWorker *const worker); -void vp9_frameworker_unlock_stats(VPxWorker *const worker); -void vp9_frameworker_signal_stats(VPxWorker *const worker); - -// Wait until ref_buf has been decoded to row in real pixel unit. -// Note: worker may already finish decoding ref_buf and release it in order to -// start decoding next frame. So need to check whether worker is still decoding -// ref_buf. -void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf, - int row); - -// FrameWorker broadcasts its decoding progress so other workers that are -// waiting on it can resume decoding. -void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row); - -// Copy necessary decoding context from src worker to dst worker. 
-void vp9_frameworker_copy_context(VPxWorker *const dst_worker, - VPxWorker *const src_worker); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP9_DECODER_VP9_DTHREAD_H_ diff --git a/libs/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c b/libs/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c index afffc77178..513718e7cb 100644 --- a/libs/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c +++ b/libs/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -16,20 +16,20 @@ #include "vp9/common/vp9_blockd.h" #include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/vpx_dsp_common.h" void vp9_fdct8x8_quant_neon(const int16_t *input, int stride, - int16_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { - int16_t temp_buffer[64]; + tran_low_t temp_buffer[64]; (void)coeff_ptr; vpx_fdct8x8_neon(input, temp_buffer, stride); - vp9_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan_ptr, iscan_ptr); + vp9_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, round_ptr, quant_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan_ptr, + iscan_ptr); } diff --git a/libs/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/libs/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c new file mode 100644 index 0000000000..4152e7bb5d --- /dev/null +++ b/libs/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_denoiser.h" +#include "vpx_mem/vpx_mem.h" + +// Compute the sum of all pixel differences of this MB. +static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { + const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total); + const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210); + const int64x1_t x = vqadd_s64(vget_high_s64(fedcba98_76543210), + vget_low_s64(fedcba98_76543210)); + const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0); + return sum_diff; +} + +// Denoise a 16x1 vector. 
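+// A scalar sketch of what the per-pixel kernel below computes (illustrative
+// only; these names are not part of the library):
+//
+//   d = abs(mc_avg[i] - sig[i]);
+//   if (d < level1_threshold)              // level1_threshold = 4 + shift_inc
+//     adj = d;
+//   else
+//     adj = level1_adjustment              // 4 + shift_inc (or 3)
+//           + (d >= level2_threshold)      // level2_threshold = 8, adds 1
+//           + 2 * (d >= level3_threshold); // level3_threshold = 16, adds 2
+//   avg[i]    = (mc_avg[i] > sig[i]) ? sat_add(sig[i], adj) : sat_sub(sig[i], adj);
+//   sum_diff += (mc_avg[i] > sig[i]) ? adj : -adj;
+//
+// The NEON version evaluates all 16 pixels of a row at once with unsigned
+// compares and masked additions.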
+static INLINE int8x16_t denoiser_16x1_neon( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold, + const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment, + const uint8x16_t v_delta_level_1_and_2, + const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) { + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + + /* Figure out which level that put us in. */ + const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff); + const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff); + const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff); + + /* Calculate absolute adjustments for level 1, 2 and 3. */ + const uint8x16_t v_level2_adjustment = + vandq_u8(v_level2_mask, v_delta_level_1_and_2); + const uint8x16_t v_level3_adjustment = + vandq_u8(v_level3_mask, v_delta_level_2_and_3); + const uint8x16_t v_level1and2_adjustment = + vaddq_u8(v_level1_adjustment, v_level2_adjustment); + const uint8x16_t v_level1and2and3_adjustment = + vaddq_u8(v_level1and2_adjustment, v_level3_adjustment); + + /* Figure adjustment absolute value by selecting between the absolute + * difference if in level0 or the value for level 1, 2 and 3. + */ + const uint8x16_t v_abs_adjustment = + vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff); + + /* Calculate positive and negative adjustments. Apply them to the signal + * and accumulate them. Adjustments are less than eight and the maximum + * sum of them (7 * 16) can fit in a signed char. + */ + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment); + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + /* Sum all the accumulators to have the sum of all pixel differences + * for this macroblock. + */ + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment), + vreinterpretq_s8_u8(v_neg_adjustment)); + v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff); + } + return v_sum_diff_total; +} + +static INLINE int8x16_t denoiser_adjust_16x1_neon( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const uint8x16_t k_delta, int8x16_t v_sum_diff_total) { + uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y); + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + // Clamp absolute difference to delta to get the adjustment. 
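+  // This second pass nudges the first-pass result back toward the source by
+  // at most k_delta per pixel, with the opposite sign to the original
+  // adjustment, so the accumulated sum_diff moves back toward zero and can be
+  // rechecked against the threshold by the caller.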
+ const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta)); + + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment); + v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment), + vreinterpretq_s8_u8(v_pos_adjustment)); + v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff); + } + return v_sum_diff_total; +} + +// Denoise 8x8 and 8x16 blocks. +static int vp9_denoiser_8xN_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, uint8_t *running_avg_y, + int avg_y_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude, + int width) { + int sum_diff_thresh, r, sum_diff = 0; + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16]; + + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + + const int b_height = (4 << b_height_log2_lookup[bs]) >> 1; + + int8x16_t v_sum_diff_total = vdupq_n_s8(0); + + for (r = 0; r < b_height; ++r) { + memcpy(sig_buffer[r], sig, width); + memcpy(sig_buffer[r] + width, sig + sig_stride, width); + memcpy(mc_running_buffer[r], mc_running_avg_y, width); + memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride, + width); + memcpy(running_buffer[r], running_avg_y, width); + memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width); + v_sum_diff_total = denoiser_16x1_neon( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], + v_level1_threshold, v_level2_threshold, v_level3_threshold, + v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3, + v_sum_diff_total); + { + const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]); + const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer); + const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer); + vst1_u8(running_avg_y, v_running_buffer_low); + vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high); + } + // Update pointers for next iteration. + sig += (sig_stride << 1); + mc_running_avg_y += (mc_avg_y_stride << 1); + running_avg_y += (avg_y_stride << 1); + } + + { + sum_diff = horizontal_add_s8x16(v_sum_diff_total); + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. 
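+      // For example (hypothetical numbers): with a BLOCK_8X16 block,
+      // num_pels_log2_lookup[bs] is 7 (128 pixels), so an excess of
+      // abs(sum_diff) - sum_diff_thresh = 200 over the threshold yields
+      // delta = (200 >> 7) + 1 = 2 below, i.e. each pixel of running_avg_y may
+      // be pulled back toward sig by at most 2 in the adjustment pass.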
+ + // The delta is set by the excess of absolute pixel diff over the + // threshold. + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vmovq_n_u8(delta); + running_avg_y -= avg_y_stride * (b_height << 1); + for (r = 0; r < b_height; ++r) { + v_sum_diff_total = denoiser_adjust_16x1_neon( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta, + v_sum_diff_total); + { + const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]); + const uint8x8_t v_running_buffer_high = + vget_high_u8(v_running_buffer); + const uint8x8_t v_running_buffer_low = + vget_low_u8(v_running_buffer); + vst1_u8(running_avg_y, v_running_buffer_low); + vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high); + } + // Update pointers for next iteration. + running_avg_y += (avg_y_stride << 1); + } + sum_diff = horizontal_add_s8x16(v_sum_diff_total); + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + + return FILTER_BLOCK; +} + +// Denoise 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks. +static int vp9_denoiser_NxM_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, uint8_t *running_avg_y, + int avg_y_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude) { + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + + const int b_width = (4 << b_width_log2_lookup[bs]); + const int b_height = (4 << b_height_log2_lookup[bs]); + const int b_width_shift4 = b_width >> 4; + + int8x16_t v_sum_diff_total[4][4]; + int r, c, sum_diff = 0; + + for (r = 0; r < 4; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r] = vdupq_n_s8(0); + } + } + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon( + sig, mc_running_avg_y, running_avg_y, v_level1_threshold, + v_level2_threshold, v_level3_threshold, v_level1_adjustment, + v_delta_level_1_and_2, v_delta_level_2_and_3, + v_sum_diff_total[c][r >> 4]); + + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]); + } + } + + // Update pointers for next iteration. + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + { + const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. 
+ if (delta < 4) { + const uint8x16_t k_delta = vdupq_n_u8(delta); + sig -= sig_stride * b_height; + mc_running_avg_y -= mc_avg_y_stride * b_height; + running_avg_y -= avg_y_stride * b_height; + sum_diff = 0; + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r >> 4] = + denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y, + k_delta, v_sum_diff_total[c][r >> 4]); + + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]); + } + } + + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude) { + // Rank by frequency of the block type to have an early termination. + if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 || + bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 || + bs == BLOCK_32X64 || bs == BLOCK_64X32) { + return vp9_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg, + avg_stride, increase_denoising, bs, + motion_magnitude); + } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) { + return vp9_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg, + avg_stride, increase_denoising, bs, + motion_magnitude, 8); + } + return COPY_BLOCK; +} diff --git a/libs/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c b/libs/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c new file mode 100644 index 0000000000..e46f789bac --- /dev/null +++ b/libs/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c @@ -0,0 +1,843 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "vp9/common/vp9_blockd.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_scale/yv12config.h" + +// Note: The scaling functions could write extra rows and columns in dst, which +// exceed the right and bottom boundaries of the destination frame. We rely on +// the following frame extension function to fix these rows and columns. 
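+// Phase-0 down-scaling is plain decimation. A scalar sketch of what the
+// vld2q_u8()/vld4q_u8() de-interleaving loads below accomplish (illustrative
+// only):
+//
+//   for (y = 0; y < h; ++y)
+//     for (x = 0; x < w; ++x)
+//       dst[y * dst_stride + x] = src[(2 * y) * src_stride + 2 * x];  // 4 * for 4:1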
+ +static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src, + const int src_stride, + uint8_t *dst, + const int dst_stride, const int w, + const int h) { + const int max_width = (w + 15) & ~15; + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + const uint8x16x2_t s = vld2q_u8(src); + vst1q_u8(dst, s.val[0]); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src, + const int src_stride, + uint8_t *dst, + const int dst_stride, const int w, + const int h) { + const int max_width = (w + 15) & ~15; + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + const uint8x16x4_t s = vld4q_u8(src); + vst1q_u8(dst, s.val[0]); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_bilinear_kernel( + const uint8x16_t in0, const uint8x16_t in1, const uint8x16_t in2, + const uint8x16_t in3, const uint8x8_t coef0, const uint8x8_t coef1, + uint8_t *const dst) { + const uint16x8_t h0 = vmull_u8(vget_low_u8(in0), coef0); + const uint16x8_t h1 = vmull_u8(vget_high_u8(in0), coef0); + const uint16x8_t h2 = vmull_u8(vget_low_u8(in2), coef0); + const uint16x8_t h3 = vmull_u8(vget_high_u8(in2), coef0); + const uint16x8_t h4 = vmlal_u8(h0, vget_low_u8(in1), coef1); + const uint16x8_t h5 = vmlal_u8(h1, vget_high_u8(in1), coef1); + const uint16x8_t h6 = vmlal_u8(h2, vget_low_u8(in3), coef1); + const uint16x8_t h7 = vmlal_u8(h3, vget_high_u8(in3), coef1); + + const uint8x8_t hor0 = vrshrn_n_u16(h4, 7); // temp: 00 01 02 03 04 05 06 07 + const uint8x8_t hor1 = vrshrn_n_u16(h5, 7); // temp: 08 09 0A 0B 0C 0D 0E 0F + const uint8x8_t hor2 = vrshrn_n_u16(h6, 7); // temp: 10 11 12 13 14 15 16 17 + const uint8x8_t hor3 = vrshrn_n_u16(h7, 7); // temp: 18 19 1A 1B 1C 1D 1E 1F + const uint16x8_t v0 = vmull_u8(hor0, coef0); + const uint16x8_t v1 = vmull_u8(hor1, coef0); + const uint16x8_t v2 = vmlal_u8(v0, hor2, coef1); + const uint16x8_t v3 = vmlal_u8(v1, hor3, coef1); + // dst: 0 1 2 3 4 5 6 7 8 9 A B C D E F + const uint8x16_t d = vcombine_u8(vrshrn_n_u16(v2, 7), vrshrn_n_u16(v3, 7)); + vst1q_u8(dst, d); +} + +static INLINE void scale_plane_2_to_1_bilinear( + const uint8_t *const src, const int src_stride, uint8_t *dst, + const int dst_stride, const int w, const int h, const int16_t c0, + const int16_t c1) { + const int max_width = (w + 15) & ~15; + const uint8_t *src0 = src; + const uint8_t *src1 = src + src_stride; + const uint8x8_t coef0 = vdup_n_u8(c0); + const uint8x8_t coef1 = vdup_n_u8(c1); + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + // 000 002 004 006 008 00A 00C 00E 010 012 014 016 018 01A 01C 01E + // 001 003 005 007 009 00B 00D 00F 011 013 015 017 019 01B 01D 01F + const uint8x16x2_t s0 = vld2q_u8(src0); + // 100 102 104 106 108 10A 10C 10E 110 112 114 116 118 11A 11C 11E + // 101 103 105 107 109 10B 10D 10F 111 113 115 117 119 11B 11D 11F + const uint8x16x2_t s1 = vld2q_u8(src1); + scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1], + coef0, coef1, dst); + src0 += 32; + src1 += 32; + dst += 16; + x -= 16; + } while (x); + src0 += 2 * (src_stride - max_width); + src1 += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_4_to_1_bilinear( + const uint8_t *const src, const int src_stride, 
uint8_t *dst, + const int dst_stride, const int w, const int h, const int16_t c0, + const int16_t c1) { + const int max_width = (w + 15) & ~15; + const uint8_t *src0 = src; + const uint8_t *src1 = src + src_stride; + const uint8x8_t coef0 = vdup_n_u8(c0); + const uint8x8_t coef1 = vdup_n_u8(c1); + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + // (*) -- useless + // 000 004 008 00C 010 014 018 01C 020 024 028 02C 030 034 038 03C + // 001 005 009 00D 011 015 019 01D 021 025 029 02D 031 035 039 03D + // 002 006 00A 00E 012 016 01A 01E 022 026 02A 02E 032 036 03A 03E (*) + // 003 007 00B 00F 013 017 01B 01F 023 027 02B 02F 033 037 03B 03F (*) + const uint8x16x4_t s0 = vld4q_u8(src0); + // 100 104 108 10C 110 114 118 11C 120 124 128 12C 130 134 138 13C + // 101 105 109 10D 111 115 119 11D 121 125 129 12D 131 135 139 13D + // 102 106 10A 10E 112 116 11A 11E 122 126 12A 12E 132 136 13A 13E (*) + // 103 107 10B 10F 113 117 11B 11F 123 127 12B 12F 133 137 13B 13F (*) + const uint8x16x4_t s1 = vld4q_u8(src1); + scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1], + coef0, coef1, dst); + src0 += 64; + src1 += 64; + dst += 16; + x -= 16; + } while (x); + src0 += 4 * (src_stride - max_width); + src1 += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE uint8x8_t scale_filter_bilinear(const uint8x8_t *const s, + const uint8x8_t *const coef) { + const uint16x8_t h0 = vmull_u8(s[0], coef[0]); + const uint16x8_t h1 = vmlal_u8(h0, s[1], coef[1]); + + return vrshrn_n_u16(h1, 7); +} + +static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 3) & ~3; + const int width_ver = (w + 7) & ~7; + const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 3) & ~3; + const int16x8_t filters = vld1q_s16(coef); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[14], d[4]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1; + + // horizontal 4x8 + // Note: processing 4x8 is about 20% faster than processing row by row using + // vld4_u8(). 
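+  // In scalar terms (offsets simplified, sketch only): each temp-buffer pixel
+  // is an 8-tap SUBPEL filter applied horizontally at stride 2, and each dst
+  // pixel is the same filter applied vertically at stride 2 over the temp
+  // buffer:
+  //   t[r][c]   ~ ROUND_POWER_OF_TWO(sum_k coef[k] * src[r][2 * c + k - 3], FILTER_BITS)
+  //   dst[r][c] ~ ROUND_POWER_OF_TWO(sum_k coef[k] * t[2 * r + k - 3][c], FILTER_BITS)
+  // clipped to 8 bits. The transposes let the filter consume eight
+  // neighbouring source columns straight from vector registers.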
+ do { + load_u8_8x8(src + 2, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); + x = width_hor; + + do { + src += 8; + load_u8_8x8(src, src_stride, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13]); + transpose_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], + &s[13]); + + d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70 + d[1] = scale_filter_8(&s[2], filters); // 01 11 21 31 41 51 61 71 + d[2] = scale_filter_8(&s[4], filters); // 02 12 22 32 42 52 62 72 + d[3] = scale_filter_8(&s[6], filters); // 03 13 23 33 43 53 63 73 + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + transpose_u8_8x4(&d[0], &d[1], &d[2], &d[3]); + vst1_lane_u32((uint32_t *)(t + 0 * width_hor), vreinterpret_u32_u8(d[0]), + 0); + vst1_lane_u32((uint32_t *)(t + 1 * width_hor), vreinterpret_u32_u8(d[1]), + 0); + vst1_lane_u32((uint32_t *)(t + 2 * width_hor), vreinterpret_u32_u8(d[2]), + 0); + vst1_lane_u32((uint32_t *)(t + 3 * width_hor), vreinterpret_u32_u8(d[3]), + 0); + vst1_lane_u32((uint32_t *)(t + 4 * width_hor), vreinterpret_u32_u8(d[0]), + 1); + vst1_lane_u32((uint32_t *)(t + 5 * width_hor), vreinterpret_u32_u8(d[1]), + 1); + vst1_lane_u32((uint32_t *)(t + 6 * width_hor), vreinterpret_u32_u8(d[2]), + 1); + vst1_lane_u32((uint32_t *)(t + 7 * width_hor), vreinterpret_u32_u8(d[3]), + 1); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + + t += 4; + x -= 4; + } while (x); + src += 8 * src_stride - 2 * width_hor; + t += 7 * width_hor; + y -= 8; + } while (y); + + // vertical 8x4 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, width_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += 6 * width_hor; + y = height_ver; + + do { + load_u8_8x8(t, width_hor, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13]); + t += 8 * width_hor; + + d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07 + d[1] = scale_filter_8(&s[2], filters); // 10 11 12 13 14 15 16 17 + d[2] = scale_filter_8(&s[4], filters); // 20 21 22 23 24 25 26 27 + d[3] = scale_filter_8(&s[6], filters); // 30 31 32 33 34 35 36 37 + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + + dst += 4 * dst_stride; + y -= 4; + } while (y); + t -= width_hor * (2 * height_ver + 6); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 1) & ~1; + const int width_ver = (w + 7) & ~7; + const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 1) & ~1; + const int16x8_t filters = vld1q_s16(coef); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[12], d[2]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3; + + // horizontal 2x8 + // Note: processing 2x8 is about 20% faster than processing row by row using + // vld4_u8(). 
+ do { + load_u8_8x8(src + 4, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_u8_4x8(&s[0], &s[1], &s[2], &s[3], s[4], s[5], s[6], s[7]); + x = width_hor; + + do { + uint8x8x2_t dd; + src += 8; + load_u8_8x8(src, src_stride, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9], + &s[10], &s[11]); + transpose_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9], &s[10], + &s[11]); + + d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70 + d[1] = scale_filter_8(&s[4], filters); // 01 11 21 31 41 51 61 71 + // dd.val[0]: 00 01 20 21 40 41 60 61 + // dd.val[1]: 10 11 30 31 50 51 70 71 + dd = vtrn_u8(d[0], d[1]); + vst1_lane_u16((uint16_t *)(t + 0 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 0); + vst1_lane_u16((uint16_t *)(t + 1 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 0); + vst1_lane_u16((uint16_t *)(t + 2 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 1); + vst1_lane_u16((uint16_t *)(t + 3 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 1); + vst1_lane_u16((uint16_t *)(t + 4 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 2); + vst1_lane_u16((uint16_t *)(t + 5 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 2); + vst1_lane_u16((uint16_t *)(t + 6 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 3); + vst1_lane_u16((uint16_t *)(t + 7 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 3); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + + t += 2; + x -= 2; + } while (x); + src += 8 * src_stride - 4 * width_hor; + t += 7 * width_hor; + y -= 8; + } while (y); + + // vertical 8x2 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x4(t, width_hor, &s[0], &s[1], &s[2], &s[3]); + t += 4 * width_hor; + y = height_ver; + + do { + load_u8_8x8(t, width_hor, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9], + &s[10], &s[11]); + t += 8 * width_hor; + + d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07 + d[1] = scale_filter_8(&s[4], filters); // 10 11 12 13 14 15 16 17 + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + + dst += 2 * dst_stride; + y -= 2; + } while (y); + t -= width_hor * (4 * height_ver + 4); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +// Notes for 4 to 3 scaling: +// +// 1. 6 rows are calculated in each horizontal inner loop, so width_hor must be +// multiple of 6, and no less than w. +// +// 2. 8 rows are calculated in each vertical inner loop, so width_ver must be +// multiple of 8, and no less than w. +// +// 3. 8 columns are calculated in each horizontal inner loop for further +// vertical scaling, so height_hor must be multiple of 8, and no less than +// 4 * h / 3. +// +// 4. 6 columns are calculated in each vertical inner loop, so height_ver must +// be multiple of 6, and no less than h. +// +// 5. The physical location of the last row of the 4 to 3 scaled frame is +// decided by phase_scaler, and are always less than 1 pixel below the last row +// of the original image. + +static void scale_plane_4_to_3_bilinear(const uint8_t *src, + const int src_stride, uint8_t *dst, + const int dst_stride, const int w, + const int h, const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = width_hor + 2; // store 2 extra pixels + const int width_ver = (w + 7) & ~7; + // We only need 1 extra row below because there are only 2 bilinear + // coefficients. 
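+  // 4:3 phase stepping: step_q4 = 16 * 4 / 3 = 21, so three output pixels are
+  // produced from every four input pixels. With phase_scaler = 8, for example,
+  // the three sub-pixel phases are 8, (8 + 21) & 15 = 13 and (8 + 42) & 15 = 2,
+  // taken at source offsets (8 + 21) >> 4 = 1 and (8 + 42) >> 4 = 3 relative to
+  // the first tap (and offsets 4, 5, 7 for the second output triple).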
+ const int height_hor = (4 * h / 3 + 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[9], d[8], c[6]; + + assert(w && h); + + c[0] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][3]); + c[1] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][4]); + c[2] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) & + SUBPEL_MASK][3]); + c[3] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) & + SUBPEL_MASK][4]); + c[4] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) & + SUBPEL_MASK][3]); + c[5] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) & + SUBPEL_MASK][4]); + + d[6] = vdup_n_u8(0); + d[7] = vdup_n_u8(0); + + // horizontal 6x8 + do { + load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + src += 1; + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); + x = width_hor; + + do { + load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7], &s[8]); + src += 8; + transpose_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7], &s[8]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = scale_filter_bilinear(&s[0], &c[0]); + d[1] = + scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); + d[2] = + scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); + d[3] = scale_filter_bilinear(&s[4], &c[0]); + d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], + &c[2]); + d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], + &c[4]); + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + // 60 61 62 63 64 65 xx xx + // 70 71 72 73 74 75 xx xx + transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + // store 2 extra pixels + vst1_u8(t + 0 * stride_hor, d[0]); + vst1_u8(t + 1 * stride_hor, d[1]); + vst1_u8(t + 2 * stride_hor, d[2]); + vst1_u8(t + 3 * stride_hor, d[3]); + vst1_u8(t + 4 * stride_hor, d[4]); + vst1_u8(t + 5 * stride_hor, d[5]); + vst1_u8(t + 6 * stride_hor, d[6]); + vst1_u8(t + 7 * stride_hor, d[7]); + + s[0] = s[8]; + + t += 6; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3 - 1; + t += 7 * stride_hor + 2; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += stride_hor; + y = height_ver; + + do { + load_u8_8x8(t, stride_hor, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7], &s[8]); + t += 8 * stride_hor; + + d[0] = scale_filter_bilinear(&s[0], &c[0]); + d[1] = + scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); + d[2] = + scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); + d[3] = scale_filter_bilinear(&s[4], &c[0]); + d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], + &c[2]); + d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], + &c[4]); + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + vst1_u8(dst + 
4 * dst_stride, d[4]); + vst1_u8(dst + 5 * dst_stride, d[5]); + + s[0] = s[8]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * (4 * height_ver / 3 + 1); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const InterpKernel *const coef, + const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = width_hor + 2; // store 2 extra pixels + const int width_ver = (w + 7) & ~7; + // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows + // above and (SUBPEL_TAPS / 2) extra rows below. + const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + const int16x8_t filters0 = + vld1q_s16(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK]); + const int16x8_t filters1 = + vld1q_s16(coef[(phase_scaler + 1 * step_q4) & SUBPEL_MASK]); + const int16x8_t filters2 = + vld1q_s16(coef[(phase_scaler + 2 * step_q4) & SUBPEL_MASK]); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[15], d[8]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2; + d[6] = vdup_n_u8(0); + d[7] = vdup_n_u8(0); + + // horizontal 6x8 + do { + load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); + x = width_hor; + + do { + src += 8; + load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], + &s[13], &s[14]); + transpose_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], &s[12], &s[13], + &s[14]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = scale_filter_8(&s[0], filters0); + d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); + d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); + d[3] = scale_filter_8(&s[4], filters0); + d[4] = + scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); + d[5] = + scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + // 60 61 62 63 64 65 xx xx + // 70 71 72 73 74 75 xx xx + transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + // store 2 extra pixels + vst1_u8(t + 0 * stride_hor, d[0]); + vst1_u8(t + 1 * stride_hor, d[1]); + vst1_u8(t + 2 * stride_hor, d[2]); + vst1_u8(t + 3 * stride_hor, d[3]); + vst1_u8(t + 4 * stride_hor, d[4]); + vst1_u8(t + 5 * stride_hor, d[5]); + vst1_u8(t + 6 * stride_hor, d[6]); + vst1_u8(t + 7 * stride_hor, d[7]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + s[6] = s[14]; + + t += 6; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3; + t += 7 * stride_hor + 2; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += 7 * stride_hor; + y = height_ver; + + do { + load_u8_8x8(t, stride_hor, &s[7], &s[8], &s[9], &s[10], &s[11], 
&s[12], + &s[13], &s[14]); + t += 8 * stride_hor; + + d[0] = scale_filter_8(&s[0], filters0); + d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); + d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); + d[3] = scale_filter_8(&s[4], filters0); + d[4] = + scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); + d[5] = + scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + vst1_u8(dst + 4 * dst_stride, d[4]); + vst1_u8(dst + 5 * dst_stride, d[5]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + s[6] = s[14]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * (4 * height_ver / 3 + 7); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +void vp9_scale_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + INTERP_FILTER filter_type, + int phase_scaler) { + const int src_w = src->y_crop_width; + const int src_h = src->y_crop_height; + const int dst_w = dst->y_crop_width; + const int dst_h = dst->y_crop_height; + const int dst_uv_w = dst_w / 2; + const int dst_uv_h = dst_h / 2; + int scaled = 0; + + // phase_scaler is usually 0 or 8. + assert(phase_scaler >= 0 && phase_scaler < 16); + + if (2 * dst_w == src_w && 2 * dst_h == src_h) { + // 2 to 1 + scaled = 1; + if (phase_scaler == 0) { + scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0, c1); + scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + } else { + const int buffer_stride = (dst_w + 3) & ~3; + const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_2_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_2_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_2_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { + // 4 to 1 + scaled = 1; + if (phase_scaler == 0) { + scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, 
dst_uv_w, dst_uv_h); + scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0, c1); + scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + } else { + const int buffer_stride = (dst_w + 1) & ~1; + const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_4_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_4_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_4_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) { + // 4 to 3 + const int buffer_stride = (dst_w + 5) - ((dst_w + 5) % 6) + 2; + const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scaled = 1; + if (filter_type == BILINEAR) { + scale_plane_4_to_3_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, phase_scaler, + temp_buffer); + scale_plane_4_to_3_bilinear(src->u_buffer, src->uv_stride, + dst->u_buffer, dst->uv_stride, dst_uv_w, + dst_uv_h, phase_scaler, temp_buffer); + scale_plane_4_to_3_bilinear(src->v_buffer, src->uv_stride, + dst->v_buffer, dst->uv_stride, dst_uv_w, + dst_uv_h, phase_scaler, temp_buffer); + } else { + scale_plane_4_to_3_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], + phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], + phase_scaler, temp_buffer); + } + free(temp_buffer); + } + } + + if (scaled) { + vpx_extend_frame_borders(dst); + } else { + // Call c version for all other scaling ratios. 
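+    // Ratios handled by the NEON paths above: 2:1 and 4:1 (phase 0, bilinear,
+    // or the general 8-tap kernel) and 4:3. Any other ratio, or any path whose
+    // temporary buffer allocation failed (scaled stays 0), falls through to
+    // the generic C scaler.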
+ vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); + } +} diff --git a/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c index 33c2fc7feb..97a09bdff6 100644 --- a/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -9,9 +9,10 @@ */ #include - +#include #include +#include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" #include "vp9/common/vp9_quant_common.h" @@ -21,97 +22,216 @@ #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_rd.h" -void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/vpx_dsp_common.h" + +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - // TODO(jingning) Decide the need of these arguments after the - // quantization process is completed. - (void)zbin_ptr; - (void)quant_shift_ptr; + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + int i; + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_one = vdupq_n_s16(1); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + int16x8_t v_round = vmovq_n_s16(round_ptr[1]); + int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); + int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); + (void)scan; + (void)skip_block; + assert(!skip_block); - if (!skip_block) { - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. 
- int i; - const int16x8_t v_zero = vdupq_n_s16(0); - const int16x8_t v_one = vdupq_n_s16(1); - int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); - int16x8_t v_round = vmovq_n_s16(round_ptr[1]); - int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); - int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); - // adjust for dc - v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); - v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); - v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); - // process dc and the first seven ac coeffs - { - const int16x8_t v_iscan = vld1q_s16(&iscan[0]); - const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]); - const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); - const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); - const int32x4_t v_tmp_lo = - vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); - const int32x4_t v_tmp_hi = - vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); - const int16x8_t v_tmp2 = - vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); - const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); - const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); - const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); - const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); - const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); - v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); - vst1q_s16(&qcoeff_ptr[0], v_qcoeff); - vst1q_s16(&dqcoeff_ptr[0], v_dqcoeff); - v_round = vmovq_n_s16(round_ptr[1]); - v_quant = vmovq_n_s16(quant_ptr[1]); - v_dequant = vmovq_n_s16(dequant_ptr[1]); - } - // now process the rest of the ac coeffs - for (i = 8; i < count; i += 8) { - const int16x8_t v_iscan = vld1q_s16(&iscan[i]); - const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]); - const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); - const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); - const int32x4_t v_tmp_lo = - vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); - const int32x4_t v_tmp_hi = - vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); - const int16x8_t v_tmp2 = - vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); - const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); - const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); - const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); - const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); - const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); - v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); - vst1q_s16(&qcoeff_ptr[i], v_qcoeff); - vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff); - } - { - const int16x4_t v_eobmax_3210 = vmax_s16( - vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210)); - const int64x1_t v_eobmax_xx32 = - vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); - const int16x4_t v_eobmax_tmp = - vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); - const int64x1_t v_eobmax_xxx3 = - vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); - const int16x4_t v_eobmax_final = - vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + // adjust for dc + v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); + v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); + v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); + // process dc and the first seven ac coeffs + { + const int16x8_t v_iscan = 
vld1q_s16(&iscan[0]); + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff); + v_round = vmovq_n_s16(round_ptr[1]); + v_quant = vmovq_n_s16(quant_ptr[1]); + v_dequant = vmovq_n_s16(dequant_ptr[1]); + } + // now process the rest of the ac coeffs + for (i = 8; i < count; i += 8) { + const int16x8_t v_iscan = vld1q_s16(&iscan[i]); + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff); + } + { + const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210), + vget_high_s16(v_eobmax_76543210)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); - *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); - } - } else { - memset(qcoeff_ptr, 0, count * sizeof(int16_t)); - memset(dqcoeff_ptr, 0, count * sizeof(int16_t)); - *eob_ptr = 0; + *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); + } +} + +static INLINE int32x4_t extract_sign_bit(int32x4_t a) { + return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); +} + +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan_ptr) { + const int16x8_t one = vdupq_n_s16(1); + const int16x8_t neg_one = 
vdupq_n_s16(-1); + + // ROUND_POWER_OF_TWO(round_ptr[], 1) + const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); + const int16x8_t quant = vld1q_s16(quant_ptr); + const int16x4_t dequant = vld1_s16(dequant_ptr); + // dequant >> 2 is used similar to zbin as a threshold. + const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); + + // Process dc and the first seven ac coeffs. + const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + const int16x8_t dequant_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh)); + + int16x8_t qcoeff = vaddq_s16(coeff_abs, round); + int32x4_t dqcoeff_0, dqcoeff_1; + int16x8_t dqcoeff; + uint16x8_t eob_max; + (void)scan; + (void)count; + (void)skip_block; + assert(!skip_block); + + // coeff * quant_ptr[]) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant); + + // Restore sign. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + qcoeff = vandq_s16(qcoeff, dequant_mask); + + // qcoeff * dequant[] / 2 + dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), dequant); + dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]); + + // Add 1 if negative to round towards zero because the C uses division. + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff = vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); + + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + + iscan_ptr += 8; + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + + { + int i; + const int16x8_t round = vrshrq_n_s16(vmovq_n_s16(round_ptr[1]), 1); + const int16x8_t quant = vmovq_n_s16(quant_ptr[1]); + const int16x8_t dequant_thresh = + vshrq_n_s16(vmovq_n_s16(dequant_ptr[1]), 2); + + // Process the rest of the ac coeffs. 
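+    // Scalar sketch of one coefficient in this 32x32 fp path (illustrative
+    // only):
+    //   abs_q = ((abs(coeff) + ROUND_POWER_OF_TWO(round[rc != 0], 1)) *
+    //            quant[rc != 0]) >> 15;
+    //   q     = (abs(coeff) >= (dequant[rc != 0] >> 2)) ? sign(coeff) * abs_q : 0;
+    //   dq    = q * dequant[rc != 0] / 2;  // C division truncates toward zero
+    // vqdmulhq_s16() supplies the ">> 15" product and extract_sign_bit()
+    // reproduces the truncating division before the final narrowing shift.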
+ for (i = 8; i < 32 * 32; i += 8) { + const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + const int16x8_t dequant_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh)); + + int16x8_t qcoeff = vaddq_s16(coeff_abs, round); + int32x4_t dqcoeff_0, dqcoeff_1; + int16x8_t dqcoeff; + + qcoeff = vqdmulhq_s16(qcoeff, quant); + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + qcoeff = vandq_s16(qcoeff, dequant_mask); + + dqcoeff_0 = vmull_n_s16(vget_low_s16(qcoeff), dequant_ptr[1]); + dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]); + + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff = + vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); + + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + + iscan_ptr += 8; + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } } } diff --git a/libs/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c b/libs/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c deleted file mode 100644 index 23f7ebace4..0000000000 --- a/libs/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "./vp9_rtcd.h" -#include "vpx_dsp/mips/macros_msa.h" - -static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride, - uint8_t *frm2_ptr, int32_t filt_sth, - int32_t filt_wgt, uint32_t *acc, - uint16_t *cnt) { - uint32_t row; - uint64_t f0, f1, f2, f3; - v16i8 frm2, frm1 = { 0 }; - v16i8 frm4, frm3 = { 0 }; - v16u8 frm_r, frm_l; - v8i16 frm2_r, frm2_l; - v8i16 diff0, diff1, mod0_h, mod1_h; - v4i32 cnst3, cnst16, filt_wt, strength; - v4i32 mod0_w, mod1_w, mod2_w, mod3_w; - v4i32 diff0_r, diff0_l, diff1_r, diff1_l; - v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; - v4i32 acc0, acc1, acc2, acc3; - v8i16 cnt0, cnt1; - - filt_wt = __msa_fill_w(filt_wgt); - strength = __msa_fill_w(filt_sth); - cnst3 = __msa_ldi_w(3); - cnst16 = __msa_ldi_w(16); - - for (row = 2; row--;) { - LD4(frm1_ptr, stride, f0, f1, f2, f3); - frm1_ptr += (4 * stride); - - LD_SB2(frm2_ptr, 16, frm2, frm4); - frm2_ptr += 32; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc + 8, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - INSERT_D2_SB(f0, f1, frm1); - INSERT_D2_SB(f2, f3, frm3); - ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - mod0_w, mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - - UNPCK_UB_SH(frm2, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc + 8, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - mod0_w, 
mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - UNPCK_UB_SH(frm4, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - } -} - -static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride, - uint8_t *frm2_ptr, - int32_t filt_sth, int32_t filt_wgt, - uint32_t *acc, uint16_t *cnt) { - uint32_t row; - v16i8 frm1, frm2, frm3, frm4; - v16u8 frm_r, frm_l; - v16i8 zero = { 0 }; - v8u16 frm2_r, frm2_l; - v8i16 diff0, diff1, mod0_h, mod1_h; - v4i32 cnst3, cnst16, filt_wt, strength; - v4i32 mod0_w, mod1_w, mod2_w, mod3_w; - v4i32 diff0_r, diff0_l, diff1_r, diff1_l; - v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; - v4i32 acc0, acc1, acc2, acc3; - v8i16 cnt0, cnt1; - - filt_wt = __msa_fill_w(filt_wgt); - strength = __msa_fill_w(filt_sth); - cnst3 = __msa_ldi_w(3); - cnst16 = __msa_ldi_w(16); - - for (row = 8; row--;) { - LD_SB2(frm1_ptr, stride, frm1, frm3); - frm1_ptr += stride; - - LD_SB2(frm2_ptr, 16, frm2, frm4); - frm2_ptr += 16; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - mod0_w, mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - - ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc + 8, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, 
strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - mod0_w, mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - - ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - - frm1_ptr += stride; - frm2_ptr += 16; - } -} - -void vp9_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride, - uint8_t *frame2_ptr, uint32_t blk_w, - uint32_t blk_h, int32_t strength, - int32_t filt_wgt, uint32_t *accu, - uint16_t *cnt) { - if (8 == (blk_w * blk_h)) { - temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength, - filt_wgt, accu, cnt); - } else if (16 == (blk_w * blk_h)) { - temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength, - filt_wgt, accu, cnt); - } else { - vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h, - strength, filt_wgt, accu, cnt); - } -} diff --git a/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.c b/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.c index 3aeefb5845..acc3764c7a 100644 --- a/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.c +++ b/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.c @@ -15,7 +15,7 @@ struct ALT_REF_AQ { int dummy; }; -struct ALT_REF_AQ *vp9_alt_ref_aq_create() { +struct ALT_REF_AQ *vp9_alt_ref_aq_create(void) { return (struct ALT_REF_AQ *)vpx_malloc(sizeof(struct ALT_REF_AQ)); } diff --git a/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.h b/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.h index 18acd8a85b..e508cb44ac 100644 --- a/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.h +++ b/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.h @@ -54,7 +54,7 @@ struct ALT_REF_AQ; * * \return Instance of the class */ -struct ALT_REF_AQ *vp9_alt_ref_aq_create(); +struct ALT_REF_AQ *vp9_alt_ref_aq_create(void); /*!\brief Upload segmentation_map to self object * diff --git a/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c index 072d92e4e9..2f2f0055a7 100644 --- a/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -127,22 +127,16 @@ int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i, const VP9_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; int bits_per_mb; - int num8x8bl = cm->MBs << 2; - // Weight for segment prior to encoding: take the average of the target - // number for the frame to be encoded and the actual from the previous frame. - int target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100; - double weight_segment = - (double)((target_refresh + cr->actual_num_seg1_blocks + - cr->actual_num_seg2_blocks) >> - 1) / - num8x8bl; - // Compute delta-q corresponding to qindex i. 
- int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); + int deltaq = 0; + if (cpi->oxcf.speed < 8) + deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); + else + deltaq = -(cr->max_qdelta_perc * i) / 200; // Take segment weighted average for bits per mb. - bits_per_mb = (int)((1.0 - weight_segment) * + bits_per_mb = (int)((1.0 - cr->weight_segment) * vp9_rc_bits_per_mb(cm->frame_type, i, correction_factor, cm->bit_depth) + - weight_segment * + cr->weight_segment * vp9_rc_bits_per_mb(cm->frame_type, i + deltaq, correction_factor, cm->bit_depth)); return bits_per_mb; @@ -246,24 +240,66 @@ void vp9_cyclic_refresh_update_sb_postencode(VP9_COMP *const cpi, } } -// Update the actual number of blocks that were applied the segment delta q. +// From the just encoded frame: update the actual number of blocks that were +// applied the segment delta q, and the amount of low motion in the frame. +// Also check conditions for forcing golden update, or preventing golden +// update if the period is up. void vp9_cyclic_refresh_postencode(VP9_COMP *const cpi) { VP9_COMMON *const cm = &cpi->common; + MODE_INFO **mi = cm->mi_grid_visible; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + RATE_CONTROL *const rc = &cpi->rc; unsigned char *const seg_map = cpi->segmentation_map; + double fraction_low = 0.0; + int force_gf_refresh = 0; + int low_content_frame = 0; int mi_row, mi_col; cr->actual_num_seg1_blocks = 0; cr->actual_num_seg2_blocks = 0; - for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) + for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { - if (cyclic_refresh_segment_id(seg_map[mi_row * cm->mi_cols + mi_col]) == - CR_SEGMENT_ID_BOOST1) + MV mv = mi[0]->mv[0].as_mv; + int map_index = mi_row * cm->mi_cols + mi_col; + if (cyclic_refresh_segment_id(seg_map[map_index]) == CR_SEGMENT_ID_BOOST1) cr->actual_num_seg1_blocks++; - else if (cyclic_refresh_segment_id( - seg_map[mi_row * cm->mi_cols + mi_col]) == + else if (cyclic_refresh_segment_id(seg_map[map_index]) == CR_SEGMENT_ID_BOOST2) cr->actual_num_seg2_blocks++; + // Accumulate low_content_frame. + if (is_inter_block(mi[0]) && abs(mv.row) < 16 && abs(mv.col) < 16) + low_content_frame++; + mi++; } + mi += 8; + } + // Check for golden frame update: only for non-SVC and non-golden boost. + if (!cpi->use_svc && cpi->ext_refresh_frame_flags_pending == 0 && + !cpi->oxcf.gf_cbr_boost_pct) { + // Force this frame as a golden update frame if this frame changes the + // resolution (resize_pending != 0). + if (cpi->resize_pending != 0) { + vp9_cyclic_refresh_set_golden_update(cpi); + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + if (rc->frames_till_gf_update_due > rc->frames_to_key) + rc->frames_till_gf_update_due = rc->frames_to_key; + cpi->refresh_golden_frame = 1; + force_gf_refresh = 1; + } + // Update average of low content/motion in the frame. + fraction_low = (double)low_content_frame / (cm->mi_rows * cm->mi_cols); + cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4; + if (!force_gf_refresh && cpi->refresh_golden_frame == 1 && + rc->frames_since_key > rc->frames_since_golden + 1) { + // Don't update golden reference if the amount of low_content for the + // current encoded frame is small, or if the recursive average of the + // low_content over the update interval window falls below threshold. + if (fraction_low < 0.65 || cr->low_content_avg < 0.6) { + cpi->refresh_golden_frame = 0; + } + // Reset for next internal. 
+ cr->low_content_avg = fraction_low; + } + } } // Set golden frame update interval, for non-svc 1 pass CBR mode. @@ -278,72 +314,8 @@ void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) { else rc->baseline_gf_interval = 40; if (cpi->oxcf.rc_mode == VPX_VBR) rc->baseline_gf_interval = 20; -} - -// Update some encoding stats (from the just encoded frame). If this frame's -// background has high motion, refresh the golden frame. Otherwise, if the -// golden reference is to be updated check if we should NOT update the golden -// ref. -void vp9_cyclic_refresh_check_golden_update(VP9_COMP *const cpi) { - VP9_COMMON *const cm = &cpi->common; - CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; - int mi_row, mi_col; - double fraction_low = 0.0; - int low_content_frame = 0; - MODE_INFO **mi = cm->mi_grid_visible; - RATE_CONTROL *const rc = &cpi->rc; - const int rows = cm->mi_rows, cols = cm->mi_cols; - int cnt1 = 0, cnt2 = 0; - int force_gf_refresh = 0; - int flag_force_gf_high_motion = 0; - for (mi_row = 0; mi_row < rows; mi_row++) { - for (mi_col = 0; mi_col < cols; mi_col++) { - if (flag_force_gf_high_motion == 1) { - int16_t abs_mvr = mi[0]->mv[0].as_mv.row >= 0 - ? mi[0]->mv[0].as_mv.row - : -1 * mi[0]->mv[0].as_mv.row; - int16_t abs_mvc = mi[0]->mv[0].as_mv.col >= 0 - ? mi[0]->mv[0].as_mv.col - : -1 * mi[0]->mv[0].as_mv.col; - // Calculate the motion of the background. - if (abs_mvr <= 16 && abs_mvc <= 16) { - cnt1++; - if (abs_mvr == 0 && abs_mvc == 0) cnt2++; - } - } - mi++; - // Accumulate low_content_frame. - if (cr->map[mi_row * cols + mi_col] < 1) low_content_frame++; - } - mi += 8; - } - // For video conference clips, if the background has high motion in current - // frame because of the camera movement, set this frame as the golden frame. - // Use 70% and 5% as the thresholds for golden frame refreshing. - // Also, force this frame as a golden update frame if this frame will change - // the resolution (resize_pending != 0). - if (cpi->resize_pending != 0 || - (cnt1 * 100 > (70 * rows * cols) && cnt2 * 20 < cnt1)) { - vp9_cyclic_refresh_set_golden_update(cpi); - rc->frames_till_gf_update_due = rc->baseline_gf_interval; - - if (rc->frames_till_gf_update_due > rc->frames_to_key) - rc->frames_till_gf_update_due = rc->frames_to_key; - cpi->refresh_golden_frame = 1; - force_gf_refresh = 1; - } - fraction_low = (double)low_content_frame / (rows * cols); - // Update average. - cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4; - if (!force_gf_refresh && cpi->refresh_golden_frame == 1) { - // Don't update golden reference if the amount of low_content for the - // current encoded frame is small, or if the recursive average of the - // low_content over the update interval window falls below threshold. - if (fraction_low < 0.8 || cr->low_content_avg < 0.7) - cpi->refresh_golden_frame = 0; - // Reset for next internal. - cr->low_content_avg = fraction_low; - } + if (rc->avg_frame_low_motion < 50 && rc->frames_since_key > 40) + rc->baseline_gf_interval = 10; } // Update the segmentation map, and related quantities: cyclic refresh map, @@ -383,13 +355,14 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { : vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex); // More aggressive settings for noisy content. 
if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) { - consec_zero_mv_thresh = 80; + consec_zero_mv_thresh = 60; qindex_thresh = VPXMAX(vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex), - 7 * cm->base_qindex >> 3); + cm->base_qindex); } do { int sum_map = 0; + int consec_zero_mv_thresh_block = consec_zero_mv_thresh; // Get the mi_row/mi_col corresponding to superblock index i. int sb_row_index = (i / sb_cols); int sb_col_index = i - sb_row_index * sb_cols; @@ -403,16 +376,19 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[BLOCK_64X64]); ymis = VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_64X64]); + if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium && + (xmis <= 2 || ymis <= 2)) + consec_zero_mv_thresh_block = 4; for (y = 0; y < ymis; y++) { for (x = 0; x < xmis; x++) { const int bl_index2 = bl_index + y * cm->mi_cols + x; // If the block is as a candidate for clean up then mark it // for possible boost/refresh (segment 1). The segment id may get - // reset to 0 later if block gets coded anything other than ZEROMV. + // reset to 0 later depending on the coding mode. if (cr->map[bl_index2] == 0) { count_tot++; if (cr->last_coded_q_map[bl_index2] > qindex_thresh || - cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh) { + cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh_block) { sum_map++; count_sel++; } @@ -445,9 +421,21 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { const RATE_CONTROL *const rc = &cpi->rc; const VP9_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int num8x8bl = cm->MBs << 2; + int target_refresh = 0; + double weight_segment_target = 0; + double weight_segment = 0; + int thresh_low_motion = (cm->width < 720) ? 55 : 20; + cr->apply_cyclic_refresh = 1; + if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 || + (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion && + rc->frames_since_key > 40)) { + cr->apply_cyclic_refresh = 0; + return; + } cr->percent_refresh = 10; if (cr->reduce_refresh) cr->percent_refresh = 5; - cr->max_qdelta_perc = 50; + cr->max_qdelta_perc = 60; cr->time_for_refresh = 0; cr->motion_thresh = 32; cr->rate_boost_fac = 15; @@ -466,10 +454,15 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->rate_boost_fac = 13; } } - // Adjust some parameters for low resolutions at low bitrates. - if (cm->width <= 352 && cm->height <= 288 && rc->avg_frame_bandwidth < 3400) { - cr->motion_thresh = 4; - cr->rate_boost_fac = 10; + // Adjust some parameters for low resolutions. + if (cm->width <= 352 && cm->height <= 288) { + if (rc->avg_frame_bandwidth < 3000) { + cr->motion_thresh = 16; + cr->rate_boost_fac = 13; + } else { + cr->max_qdelta_perc = 70; + cr->rate_ratio_qdelta = VPXMAX(cr->rate_ratio_qdelta, 2.5); + } } if (cpi->svc.spatial_layer_id > 0) { cr->motion_thresh = 4; @@ -487,6 +480,19 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->rate_ratio_qdelta = 1.0; } } + // Weight for segment prior to encoding: take the average of the target + // number for the frame to be encoded and the actual from the previous frame. + // Use the target if its less. To be used for setting the base qp for the + // frame in vp9_rc_regulate_q. 
+ target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100; + weight_segment_target = (double)(target_refresh) / num8x8bl; + weight_segment = (double)((target_refresh + cr->actual_num_seg1_blocks + + cr->actual_num_seg2_blocks) >> + 1) / + num8x8bl; + if (weight_segment_target < 7 * weight_segment / 8) + weight_segment = weight_segment_target; + cr->weight_segment = weight_segment; } // Setup cyclic background refresh: set delta q and segmentation map. @@ -495,14 +501,8 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { const RATE_CONTROL *const rc = &cpi->rc; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; struct segmentation *const seg = &cm->seg; - // TODO(marpan): Look into whether we should reduce the amount/delta-qp - // instead of completely shutting off at low bitrates. For now keep it on. - // const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc); - const int apply_cyclic_refresh = 1; if (cm->current_video_frame == 0) cr->low_content_avg = 0.0; - // Don't apply refresh on key frame or temporal enhancement layer frames. - if (!apply_cyclic_refresh || (cm->frame_type == KEY_FRAME) || - (cpi->force_update_segmentation) || (cpi->svc.temporal_layer_id > 0)) { + if (!cr->apply_cyclic_refresh || (cpi->force_update_segmentation)) { // Set segmentation map to 0 and disable. unsigned char *const seg_map = cpi->segmentation_map; memset(seg_map, 0, cm->mi_rows * cm->mi_cols); @@ -511,6 +511,7 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { memset(cr->last_coded_q_map, MAXQ, cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); cr->sb_index = 0; + cr->reduce_refresh = 0; } return; } else { diff --git a/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h index a4be031295..77fa67c9e1 100644 --- a/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h +++ b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h @@ -66,6 +66,8 @@ struct CYCLIC_REFRESH { double low_content_avg; int qindex_delta[3]; int reduce_refresh; + double weight_segment; + int apply_cyclic_refresh; }; struct VP9_COMP; @@ -104,15 +106,15 @@ void vp9_cyclic_refresh_update_sb_postencode(struct VP9_COMP *const cpi, // refresh sb_index, and target number of blocks to be refreshed. void vp9_cyclic_refresh_update__map(struct VP9_COMP *const cpi); -// Update the actual number of blocks that were applied the segment delta q. +// From the just encoded frame: update the actual number of blocks that were +// applied the segment delta q, and the amount of low motion in the frame. +// Also check conditions for forcing golden update, or preventing golden +// update if the period is up. void vp9_cyclic_refresh_postencode(struct VP9_COMP *const cpi); // Set golden frame update interval, for non-svc 1 pass CBR mode. void vp9_cyclic_refresh_set_golden_update(struct VP9_COMP *const cpi); -// Check if we should not update golden reference, based on past refresh stats. -void vp9_cyclic_refresh_check_golden_update(struct VP9_COMP *const cpi); - // Set/update global/frame level refresh parameters. 
void vp9_cyclic_refresh_update_parameters(struct VP9_COMP *const cpi); diff --git a/libs/libvpx/vp9/encoder/vp9_bitstream.c b/libs/libvpx/vp9/encoder/vp9_bitstream.c index 860da83330..d346cd57aa 100644 --- a/libs/libvpx/vp9/encoder/vp9_bitstream.c +++ b/libs/libvpx/vp9/encoder/vp9_bitstream.c @@ -80,8 +80,8 @@ static void prob_diff_update(const vpx_tree_index *tree, vp9_cond_prob_diff_update(w, &probs[i], branch_ct[i]); } -static void write_selected_tx_size(const VP9_COMMON *cm, const MACROBLOCKD *xd, - vpx_writer *w) { +static void write_selected_tx_size(const VP9_COMMON *cm, + const MACROBLOCKD *const xd, vpx_writer *w) { TX_SIZE tx_size = xd->mi[0]->tx_size; BLOCK_SIZE bsize = xd->mi[0]->sb_type; const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; @@ -95,7 +95,7 @@ static void write_selected_tx_size(const VP9_COMMON *cm, const MACROBLOCKD *xd, } } -static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd, +static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *const xd, int segment_id, const MODE_INFO *mi, vpx_writer *w) { if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; @@ -195,7 +195,7 @@ static void write_segment_id(vpx_writer *w, const struct segmentation *seg, } // This function encodes the reference frame -static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd, +static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *const xd, vpx_writer *w) { const MODE_INFO *const mi = xd->mi[0]; const int is_compound = has_second_ref(mi); @@ -213,7 +213,7 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd, if (cm->reference_mode == REFERENCE_MODE_SELECT) { vpx_write(w, is_compound, vp9_get_reference_mode_prob(cm, xd)); } else { - assert(!is_compound == (cm->reference_mode == SINGLE_REFERENCE)); + assert((!is_compound) == (cm->reference_mode == SINGLE_REFERENCE)); } if (is_compound) { @@ -230,14 +230,15 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd, } } -static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, - vpx_writer *w) { +static void pack_inter_mode_mvs( + VP9_COMP *cpi, const MACROBLOCKD *const xd, + const MB_MODE_INFO_EXT *const mbmi_ext, vpx_writer *w, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) { VP9_COMMON *const cm = &cpi->common; const nmv_context *nmvc = &cm->fc->nmvc; - const MACROBLOCK *const x = &cpi->td.mb; - const MACROBLOCKD *const xd = &x->e_mbd; const struct segmentation *const seg = &cm->seg; - const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const MODE_INFO *const mi = xd->mi[0]; const PREDICTION_MODE mode = mi->mode; const int segment_id = mi->segment_id; const BLOCK_SIZE bsize = mi->sb_type; @@ -299,7 +300,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, vp9_write_token(w, vp9_switchable_interp_tree, cm->fc->switchable_interp_prob[ctx], &switchable_interp_encodings[mi->interp_filter]); - ++cpi->interp_filter_selected[0][mi->interp_filter]; + ++interp_filter_selected[0][mi->interp_filter]; } else { assert(mi->interp_filter == cm->interp_filter); } @@ -317,7 +318,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, for (ref = 0; ref < 1 + is_compound; ++ref) vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv, &mbmi_ext->ref_mvs[mi->ref_frame[ref]][0].as_mv, - nmvc, allow_hp); + nmvc, allow_hp, max_mv_magnitude); } } } @@ -326,16 +327,16 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, for (ref = 0; ref < 1 + 
is_compound; ++ref) vp9_encode_mv(cpi, w, &mi->mv[ref].as_mv, &mbmi_ext->ref_mvs[mi->ref_frame[ref]][0].as_mv, nmvc, - allow_hp); + allow_hp, max_mv_magnitude); } } } } static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd, - MODE_INFO **mi_8x8, vpx_writer *w) { + vpx_writer *w) { const struct segmentation *const seg = &cm->seg; - const MODE_INFO *const mi = mi_8x8[0]; + const MODE_INFO *const mi = xd->mi[0]; const MODE_INFO *const above_mi = xd->above_mi; const MODE_INFO *const left_mi = xd->left_mi; const BLOCK_SIZE bsize = mi->sb_type; @@ -366,27 +367,27 @@ static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd, write_intra_mode(w, mi->uv_mode, vp9_kf_uv_mode_prob[mi->mode]); } -static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile, - vpx_writer *w, TOKENEXTRA **tok, - const TOKENEXTRA *const tok_end, int mi_row, - int mi_col) { +static void write_modes_b( + VP9_COMP *cpi, MACROBLOCKD *const xd, const TileInfo *const tile, + vpx_writer *w, TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, + int mi_row, int mi_col, unsigned int *const max_mv_magnitude, + int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) { const VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + const MB_MODE_INFO_EXT *const mbmi_ext = + cpi->td.mb.mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); MODE_INFO *m; xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col); m = xd->mi[0]; - cpi->td.mb.mbmi_ext = - cpi->td.mb.mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); - set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[m->sb_type], mi_col, num_8x8_blocks_wide_lookup[m->sb_type], cm->mi_rows, cm->mi_cols); if (frame_is_intra_only(cm)) { - write_mb_modes_kf(cm, xd, xd->mi, w); + write_mb_modes_kf(cm, xd, w); } else { - pack_inter_mode_mvs(cpi, m, w); + pack_inter_mode_mvs(cpi, xd, mbmi_ext, w, max_mv_magnitude, + interp_filter_selected); } assert(*tok < tok_end); @@ -415,13 +416,13 @@ static void write_partition(const VP9_COMMON *const cm, } } -static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile, - vpx_writer *w, TOKENEXTRA **tok, - const TOKENEXTRA *const tok_end, int mi_row, - int mi_col, BLOCK_SIZE bsize) { +static void write_modes_sb( + VP9_COMP *cpi, MACROBLOCKD *const xd, const TileInfo *const tile, + vpx_writer *w, TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, + int mi_row, int mi_col, BLOCK_SIZE bsize, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) { const VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; - const int bsl = b_width_log2_lookup[bsize]; const int bs = (1 << bsl) / 4; PARTITION_TYPE partition; @@ -436,30 +437,37 @@ static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile, write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w); subsize = get_subsize(bsize, partition); if (subsize < BLOCK_8X8) { - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, + max_mv_magnitude, interp_filter_selected); } else { switch (partition) { case PARTITION_NONE: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, + max_mv_magnitude, interp_filter_selected); break; case PARTITION_HORZ: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, + max_mv_magnitude, interp_filter_selected); 
if (mi_row + bs < cm->mi_rows) - write_modes_b(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col); + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col, + max_mv_magnitude, interp_filter_selected); break; case PARTITION_VERT: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, + max_mv_magnitude, interp_filter_selected); if (mi_col + bs < cm->mi_cols) - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs); + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs, + max_mv_magnitude, interp_filter_selected); break; case PARTITION_SPLIT: - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize); - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs, - subsize); - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col, - subsize); - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col + bs, - subsize); + write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, subsize, + max_mv_magnitude, interp_filter_selected); + write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs, + subsize, max_mv_magnitude, interp_filter_selected); + write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col, + subsize, max_mv_magnitude, interp_filter_selected); + write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col + bs, + subsize, max_mv_magnitude, interp_filter_selected); break; default: assert(0); } @@ -471,21 +479,32 @@ static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile, update_partition_context(xd, mi_row, mi_col, subsize, bsize); } -static void write_modes(VP9_COMP *cpi, const TileInfo *const tile, - vpx_writer *w, TOKENEXTRA **tok, - const TOKENEXTRA *const tok_end) { +static void write_modes( + VP9_COMP *cpi, MACROBLOCKD *const xd, const TileInfo *const tile, + vpx_writer *w, int tile_row, int tile_col, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) { const VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; - int mi_row, mi_col; + int mi_row, mi_col, tile_sb_row; + TOKENEXTRA *tok = NULL; + TOKENEXTRA *tok_end = NULL; set_partition_probs(cm, xd); for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; mi_row += MI_BLOCK_SIZE) { + tile_sb_row = mi_cols_aligned_to_sb(mi_row - tile->mi_row_start) >> + MI_BLOCK_SIZE_LOG2; + tok = cpi->tplist[tile_row][tile_col][tile_sb_row].start; + tok_end = tok + cpi->tplist[tile_row][tile_col][tile_sb_row].count; + vp9_zero(xd->left_seg_context); for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, BLOCK_64X64); + write_modes_sb(cpi, xd, tile, w, &tok, tok_end, mi_row, mi_col, + BLOCK_64X64, max_mv_magnitude, interp_filter_selected); + + assert(tok == cpi->tplist[tile_row][tile_col][tile_sb_row].stop); } } @@ -900,11 +919,130 @@ int vp9_get_refresh_mask(VP9_COMP *cpi) { } } +static int encode_tile_worker(void *arg1, void *arg2) { + VP9_COMP *cpi = (VP9_COMP *)arg1; + VP9BitstreamWorkerData *data = (VP9BitstreamWorkerData *)arg2; + MACROBLOCKD *const xd = &data->xd; + const int tile_row = 0; + vpx_start_encode(&data->bit_writer, data->dest); + write_modes(cpi, xd, &cpi->tile_data[data->tile_idx].tile_info, + &data->bit_writer, tile_row, data->tile_idx, + &data->max_mv_magnitude, data->interp_filter_selected); + vpx_stop_encode(&data->bit_writer); + return 1; +} + +void 
vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi) { + if (cpi->vp9_bitstream_worker_data) { + int i; + for (i = 1; i < cpi->num_workers; ++i) { + vpx_free(cpi->vp9_bitstream_worker_data[i].dest); + } + vpx_free(cpi->vp9_bitstream_worker_data); + cpi->vp9_bitstream_worker_data = NULL; + } +} + +static int encode_tiles_buffer_alloc(VP9_COMP *const cpi) { + int i; + const size_t worker_data_size = + cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data); + cpi->vp9_bitstream_worker_data = vpx_memalign(16, worker_data_size); + memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size); + if (!cpi->vp9_bitstream_worker_data) return 1; + for (i = 1; i < cpi->num_workers; ++i) { + cpi->vp9_bitstream_worker_data[i].dest_size = + cpi->oxcf.width * cpi->oxcf.height; + cpi->vp9_bitstream_worker_data[i].dest = + vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size); + if (!cpi->vp9_bitstream_worker_data[i].dest) return 1; + } + return 0; +} + +static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) { + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int num_workers = cpi->num_workers; + size_t total_size = 0; + int tile_col = 0; + + if (!cpi->vp9_bitstream_worker_data || + cpi->vp9_bitstream_worker_data[1].dest_size > + (cpi->oxcf.width * cpi->oxcf.height)) { + vp9_bitstream_encode_tiles_buffer_dealloc(cpi); + if (encode_tiles_buffer_alloc(cpi)) return 0; + } + + while (tile_col < tile_cols) { + int i, j; + for (i = 0; i < num_workers && tile_col < tile_cols; ++i) { + VPxWorker *const worker = &cpi->workers[i]; + VP9BitstreamWorkerData *const data = &cpi->vp9_bitstream_worker_data[i]; + + // Populate the worker data. + data->xd = cpi->td.mb.e_mbd; + data->tile_idx = tile_col; + data->max_mv_magnitude = cpi->max_mv_magnitude; + memset(data->interp_filter_selected, 0, + sizeof(data->interp_filter_selected[0][0]) * SWITCHABLE); + + // First thread can directly write into the output buffer. + if (i == 0) { + // If this worker happens to be for the last tile, then do not offset it + // by 4 for the tile size. + data->dest = + data_ptr + total_size + (tile_col == tile_cols - 1 ? 0 : 4); + } + worker->data1 = cpi; + worker->data2 = data; + worker->hook = encode_tile_worker; + worker->had_error = 0; + + if (i < num_workers - 1) { + winterface->launch(worker); + } else { + winterface->execute(worker); + } + ++tile_col; + } + for (j = 0; j < i; ++j) { + VPxWorker *const worker = &cpi->workers[j]; + VP9BitstreamWorkerData *const data = + (VP9BitstreamWorkerData *)worker->data2; + uint32_t tile_size; + int k; + + if (!winterface->sync(worker)) return 0; + tile_size = data->bit_writer.pos; + + // Aggregate per-thread bitstream stats. + cpi->max_mv_magnitude = + VPXMAX(cpi->max_mv_magnitude, data->max_mv_magnitude); + for (k = 0; k < SWITCHABLE; ++k) { + cpi->interp_filter_selected[0][k] += data->interp_filter_selected[0][k]; + } + + // Prefix the size of the tile on all but the last. 
+ if (tile_col != tile_cols || j < i - 1) { + mem_put_be32(data_ptr + total_size, tile_size); + total_size += 4; + } + if (j > 0) { + memcpy(data_ptr + total_size, data->dest, tile_size); + } + total_size += tile_size; + } + } + return total_size; +} + static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; vpx_writer residual_bc; int tile_row, tile_col; - TOKENEXTRA *tok_end; size_t total_size = 0; const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; @@ -912,22 +1050,27 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { memset(cm->above_seg_context, 0, sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols)); + // Encoding tiles in parallel is done only for realtime mode now. In other + // modes the speed up is insignificant and requires further testing to ensure + // that it does not make the overall process worse in any case. + if (cpi->oxcf.mode == REALTIME && cpi->num_workers > 1 && tile_rows == 1 && + tile_cols > 1) { + return encode_tiles_mt(cpi, data_ptr); + } + for (tile_row = 0; tile_row < tile_rows; tile_row++) { for (tile_col = 0; tile_col < tile_cols; tile_col++) { int tile_idx = tile_row * tile_cols + tile_col; - TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col]; - - tok_end = cpi->tile_tok[tile_row][tile_col] + - cpi->tok_count[tile_row][tile_col]; if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) vpx_start_encode(&residual_bc, data_ptr + total_size + 4); else vpx_start_encode(&residual_bc, data_ptr + total_size); - write_modes(cpi, &cpi->tile_data[tile_idx].tile_info, &residual_bc, &tok, - tok_end); - assert(tok == tok_end); + write_modes(cpi, xd, &cpi->tile_data[tile_idx].tile_info, &residual_bc, + tile_row, tile_col, &cpi->max_mv_magnitude, + cpi->interp_filter_selected); + vpx_stop_encode(&residual_bc); if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) { // size of this tile @@ -938,7 +1081,6 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { total_size += residual_bc.pos; } } - return total_size; } diff --git a/libs/libvpx/vp9/encoder/vp9_bitstream.h b/libs/libvpx/vp9/encoder/vp9_bitstream.h index 8c97d37f77..339c3fecb1 100644 --- a/libs/libvpx/vp9/encoder/vp9_bitstream.h +++ b/libs/libvpx/vp9/encoder/vp9_bitstream.h @@ -17,8 +17,24 @@ extern "C" { #include "vp9/encoder/vp9_encoder.h" +typedef struct VP9BitstreamWorkerData { + uint8_t *dest; + int dest_size; + vpx_writer bit_writer; + int tile_idx; + unsigned int max_mv_magnitude; + // The size of interp_filter_selected in VP9_COMP is actually + // MAX_REFERENCE_FRAMES x SWITCHABLE. But when encoding tiles, all we ever do + // is increment the very first index (index 0) for the first dimension. Hence + // this is sufficient. 
+ int interp_filter_selected[1][SWITCHABLE]; + DECLARE_ALIGNED(16, MACROBLOCKD, xd); +} VP9BitstreamWorkerData; + int vp9_get_refresh_mask(VP9_COMP *cpi); +void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi); + void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size); static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) { diff --git a/libs/libvpx/vp9/encoder/vp9_block.h b/libs/libvpx/vp9/encoder/vp9_block.h index 1ea5fdf1ff..724205dd57 100644 --- a/libs/libvpx/vp9/encoder/vp9_block.h +++ b/libs/libvpx/vp9/encoder/vp9_block.h @@ -11,6 +11,8 @@ #ifndef VP9_ENCODER_VP9_BLOCK_H_ #define VP9_ENCODER_VP9_BLOCK_H_ +#include "vpx_util/vpx_thread.h" + #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" @@ -61,6 +63,11 @@ typedef struct { typedef struct macroblock MACROBLOCK; struct macroblock { +// cf. https://bugs.chromium.org/p/webm/issues/detail?id=1054 +#if defined(_MSC_VER) && _MSC_VER < 1900 + int64_t bsse[MAX_MB_PLANE << 2]; +#endif + struct macroblock_plane plane[MAX_MB_PLANE]; MACROBLOCKD e_mbd; @@ -86,8 +93,6 @@ struct macroblock { int rddiv; int rdmult; int mb_energy; - int *m_search_count_ptr; - int *ex_search_count_ptr; // These are set to their default values at the beginning, and then adjusted // further in the encoding process. @@ -118,6 +123,9 @@ struct macroblock { // Set during mode selection. Read during block encoding. uint8_t zcoeff_blk[TX_SIZES][256]; + // Accumulate the tx block eobs in a partition block. + int32_t sum_y_eobs[TX_SIZES]; + int skip; int encode_breakout; @@ -131,6 +139,10 @@ struct macroblock { int use_lp32x32fdct; int skip_encode; + // In first pass, intra prediction is done based on source pixels + // at tile boundaries + int fp_src_pred; + // use fast quantization process int quant_fp; @@ -140,7 +152,10 @@ struct macroblock { #define SKIP_TXFM_AC_DC 1 #define SKIP_TXFM_AC_ONLY 2 +// cf. https://bugs.chromium.org/p/webm/issues/detail?id=1054 +#if !defined(_MSC_VER) || _MSC_VER >= 1900 int64_t bsse[MAX_MB_PLANE << 2]; +#endif // Used to store sub partition's choices. MV pred_mv[MAX_REF_FRAMES]; @@ -151,16 +166,38 @@ struct macroblock { uint8_t sb_is_skin; + uint8_t skip_low_source_sad; + + uint8_t lowvar_highsumdiff; + + uint8_t last_sb_high_content; + + int sb_use_mv_part; + + int sb_mvcol_part; + + int sb_mvrow_part; + + int sb_pickmode_part; + + // For each superblock: saves the content value (e.g., low/high sad/sumdiff) + // based on source sad, prior to encoding the frame. + uint8_t content_state_sb; + // Used to save the status of whether a block has a low variance in // choose_partitioning. 0 for 64x64, 1~2 for 64x32, 3~4 for 32x64, 5~8 for // 32x32, 9~24 for 16x16. 
uint8_t variance_low[25]; - void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride); - void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob); + uint8_t arf_frame_usage; + uint8_t lastgolden_frame_usage; + + void (*fwd_txfm4x4)(const int16_t *input, tran_low_t *output, int stride); + void (*inv_txfm_add)(const tran_low_t *input, uint8_t *dest, int stride, + int eob); #if CONFIG_VP9_HIGHBITDEPTH - void (*highbd_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, - int eob, int bd); + void (*highbd_inv_txfm_add)(const tran_low_t *input, uint16_t *dest, + int stride, int eob, int bd); #endif }; diff --git a/libs/libvpx/vp9/encoder/vp9_context_tree.h b/libs/libvpx/vp9/encoder/vp9_context_tree.h index 86ba03d69f..73423c0758 100644 --- a/libs/libvpx/vp9/encoder/vp9_context_tree.h +++ b/libs/libvpx/vp9/encoder/vp9_context_tree.h @@ -65,12 +65,16 @@ typedef struct { int_mv best_sse_mv; MV_REFERENCE_FRAME best_reference_frame; MV_REFERENCE_FRAME best_zeromv_reference_frame; + int sb_skip_denoising; #endif // motion vector cache for adaptive motion search control in partition // search loop MV pred_mv[MAX_REF_FRAMES]; INTERP_FILTER pred_interp_filter; + + // Used for the machine learning-based early termination + int32_t sum_y_eobs; } PICK_MODE_CONTEXT; typedef struct PC_TREE { diff --git a/libs/libvpx/vp9/encoder/vp9_dct.c b/libs/libvpx/vp9/encoder/vp9_dct.c index bb8c23fdb9..5c66562a56 100644 --- a/libs/libvpx/vp9/encoder/vp9_dct.c +++ b/libs/libvpx/vp9/encoder/vp9_dct.c @@ -556,9 +556,8 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -567,6 +566,8 @@ void vp9_fdct8x8_quant_c(const int16_t *input, int stride, int i, j; tran_low_t intermediate[64]; + (void)iscan; + // Transform columns { tran_low_t *output = intermediate; @@ -632,12 +633,6 @@ void vp9_fdct8x8_quant_c(const int16_t *input, int stride, for (j = 0; j < 8; ++j) coeff_ptr[j + i * 8] /= 2; } - // TODO(jingning) Decide the need of these arguments after the - // quantization process is completed. 
- (void)zbin_ptr; - (void)quant_shift_ptr; - (void)iscan; - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); diff --git a/libs/libvpx/vp9/encoder/vp9_denoiser.c b/libs/libvpx/vp9/encoder/vp9_denoiser.c index 1d9a6702df..b08ccaa66c 100644 --- a/libs/libvpx/vp9/encoder/vp9_denoiser.c +++ b/libs/libvpx/vp9/encoder/vp9_denoiser.c @@ -185,33 +185,42 @@ static uint8_t *block_start(uint8_t *framebuf, int stride, int mi_row, } static VP9_DENOISER_DECISION perform_motion_compensation( - VP9_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs, + VP9_COMMON *const cm, VP9_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs, int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx, - int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv) { - int sse_diff = ctx->zeromv_sse - ctx->newmv_sse; - MV_REFERENCE_FRAME frame; + int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv, + int num_spatial_layers, int width, int lst_fb_idx, int gld_fb_idx, + int use_svc, int spatial_layer) { + const int sse_diff = (ctx->newmv_sse == UINT_MAX) + ? 0 + : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse); + int frame; + int denoise_layer_idx = 0; MACROBLOCKD *filter_mbd = &mb->e_mbd; MODE_INFO *mi = filter_mbd->mi[0]; MODE_INFO saved_mi; int i; struct buf_2d saved_dst[MAX_MB_PLANE]; struct buf_2d saved_pre[MAX_MB_PLANE]; + RefBuffer *saved_block_refs[2]; + MV_REFERENCE_FRAME saved_frame; frame = ctx->best_reference_frame; + saved_mi = *mi; if (is_skin && (motion_magnitude > 0 || consec_zeromv < 4)) return COPY_BLOCK; - // Avoid denoising for small block (unless motion is small). - // Small blocks are selected in variance partition (before encoding) and - // will typically lie on moving areas. - if (denoiser->denoising_level < kDenHigh && motion_magnitude > 16 && - bs <= BLOCK_8X8) + // Avoid denoising small blocks. When noise > kDenLow or frame width > 480, + // denoise 16x16 blocks. + if (bs == BLOCK_8X8 || bs == BLOCK_8X16 || bs == BLOCK_16X8 || + (bs == BLOCK_16X16 && width > 480 && + denoiser->denoising_level <= kDenLow)) return COPY_BLOCK; // If the best reference frame uses inter-prediction and there is enough of a // difference in sum-squared-error, use it. - if (frame != INTRA_FRAME && ctx->newmv_sse != UINT_MAX && + if (frame != INTRA_FRAME && frame != ALTREF_FRAME && + (frame != GOLDEN_FRAME || num_spatial_layers == 1) && sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) { mi->ref_frame[0] = ctx->best_reference_frame; mi->mode = ctx->best_sse_inter_mode; @@ -221,9 +230,10 @@ static VP9_DENOISER_DECISION perform_motion_compensation( frame = ctx->best_zeromv_reference_frame; ctx->newmv_sse = ctx->zeromv_sse; // Bias to last reference. - if (frame != LAST_FRAME && - ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) || - denoiser->denoising_level >= kDenHigh)) { + if (num_spatial_layers > 1 || frame == ALTREF_FRAME || + (frame != LAST_FRAME && + ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) || + denoiser->denoising_level >= kDenHigh))) { frame = LAST_FRAME; ctx->newmv_sse = ctx->zeromv_lastref_sse; } @@ -238,6 +248,19 @@ static VP9_DENOISER_DECISION perform_motion_compensation( } } + saved_frame = frame; + // When using SVC, we need to map REF_FRAME to the frame buffer index. + if (use_svc) { + if (frame == LAST_FRAME) + frame = lst_fb_idx + 1; + else if (frame == GOLDEN_FRAME) + frame = gld_fb_idx + 1; + // Shift for the second spatial layer. 
+ if (num_spatial_layers - spatial_layer == 2) + frame = frame + denoiser->num_ref_frames; + denoise_layer_idx = num_spatial_layers - spatial_layer - 1; + } + if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) { // Restore everything to its original state *mi = saved_mi; @@ -254,6 +277,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( saved_pre[i] = filter_mbd->plane[i].pre[0]; saved_dst[i] = filter_mbd->plane[i].dst; } + saved_block_refs[0] = filter_mbd->block_refs[0]; // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser // struct. @@ -270,23 +294,28 @@ static VP9_DENOISER_DECISION perform_motion_compensation( denoiser->running_avg_y[frame].uv_stride, mi_row, mi_col); filter_mbd->plane[2].pre[0].stride = denoiser->running_avg_y[frame].uv_stride; - filter_mbd->plane[0].dst.buf = - block_start(denoiser->mc_running_avg_y.y_buffer, - denoiser->mc_running_avg_y.y_stride, mi_row, mi_col); - filter_mbd->plane[0].dst.stride = denoiser->mc_running_avg_y.y_stride; - filter_mbd->plane[1].dst.buf = - block_start(denoiser->mc_running_avg_y.u_buffer, - denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col); - filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.uv_stride; - filter_mbd->plane[2].dst.buf = - block_start(denoiser->mc_running_avg_y.v_buffer, - denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col); - filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.uv_stride; + filter_mbd->plane[0].dst.buf = block_start( + denoiser->mc_running_avg_y[denoise_layer_idx].y_buffer, + denoiser->mc_running_avg_y[denoise_layer_idx].y_stride, mi_row, mi_col); + filter_mbd->plane[0].dst.stride = + denoiser->mc_running_avg_y[denoise_layer_idx].y_stride; + filter_mbd->plane[1].dst.buf = block_start( + denoiser->mc_running_avg_y[denoise_layer_idx].u_buffer, + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col); + filter_mbd->plane[1].dst.stride = + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride; + filter_mbd->plane[2].dst.buf = block_start( + denoiser->mc_running_avg_y[denoise_layer_idx].v_buffer, + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col); + filter_mbd->plane[2].dst.stride = + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride; + set_ref_ptrs(cm, filter_mbd, saved_frame, NONE); vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs); // Restore everything to its original state *mi = saved_mi; + filter_mbd->block_refs[0] = saved_block_refs[0]; for (i = 0; i < MAX_MB_PLANE; ++i) { filter_mbd->plane[i].pre[0] = saved_pre[i]; filter_mbd->plane[i].dst = saved_dst[i]; @@ -303,13 +332,22 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, int zeromv_filter = 0; VP9_DENOISER *denoiser = &cpi->denoiser; VP9_DENOISER_DECISION decision = COPY_BLOCK; - YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME]; - YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y; + + const int shift = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 + ? 
denoiser->num_ref_frames + : 0; + YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift]; + const int denoise_layer_index = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1; + YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index]; uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col); + uint8_t *mc_avg_start = block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col); struct buf_2d src = mb->plane[0].src; int is_skin = 0; + int increase_denoising = 0; int consec_zeromv = 0; mv_col = ctx->best_sse_mv.as_mv.col; mv_row = ctx->best_sse_mv.as_mv.row; @@ -326,8 +364,8 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, VP9_COMMON *const cm = &cpi->common; int j, i; // Loop through the 8x8 sub-blocks. - const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; - const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64]; + const int bw = num_8x8_blocks_wide_lookup[bs]; + const int bh = num_8x8_blocks_high_lookup[bs]; const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); const int block_index = mi_row * cm->mi_cols + mi_col; @@ -352,30 +390,28 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, mb->plane[0].src.stride, mb->plane[1].src.stride, bs, consec_zeromv, motion_level); } - if (!is_skin && denoiser->denoising_level == kDenHigh) { - denoiser->increase_denoising = 1; - } else { - denoiser->increase_denoising = 0; - } + if (!is_skin && denoiser->denoising_level == kDenHigh) increase_denoising = 1; - if (denoiser->denoising_level >= kDenLow) + if (denoiser->denoising_level >= kDenLow && !ctx->sb_skip_denoising) decision = perform_motion_compensation( - denoiser, mb, bs, denoiser->increase_denoising, mi_row, mi_col, ctx, - motion_magnitude, is_skin, &zeromv_filter, consec_zeromv); + &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx, + motion_magnitude, is_skin, &zeromv_filter, consec_zeromv, + cpi->svc.number_spatial_layers, cpi->Source->y_width, cpi->lst_fb_idx, + cpi->gld_fb_idx, cpi->use_svc, cpi->svc.spatial_layer_id); if (decision == FILTER_BLOCK) { - decision = vp9_denoiser_filter( - src.buf, src.stride, mc_avg_start, mc_avg.y_stride, avg_start, - avg.y_stride, denoiser->increase_denoising, bs, motion_magnitude); + decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start, + mc_avg.y_stride, avg_start, avg.y_stride, + increase_denoising, bs, motion_magnitude); } if (decision == FILTER_BLOCK) { - vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, - NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2, + vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, 0, + 0, 0, num_4x4_blocks_wide_lookup[bs] << 2, num_4x4_blocks_high_lookup[bs] << 2); } else { // COPY_BLOCK - vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, - NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2, + vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, 0, + 0, 0, num_4x4_blocks_wide_lookup[bs] << 2, num_4x4_blocks_high_lookup[bs] << 2); } *denoiser_decision = decision; @@ -408,19 +444,23 @@ static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest, src->y_buffer = tmp_buf; } -void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, - YV12_BUFFER_CONFIG src, - FRAME_TYPE frame_type, - int refresh_alt_ref_frame, - int refresh_golden_frame, - int refresh_last_frame, int resized) { +void vp9_denoiser_update_frame_info( + VP9_DENOISER 
*denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type, + int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame, + int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized, + int svc_base_is_key, int second_spatial_layer) { + const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0; // Copy source into denoised reference buffers on KEY_FRAME or - // if the just encoded frame was resized. - if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset) { + // if the just encoded frame was resized. For SVC, copy source if the base + // spatial layer was key frame. + if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset || + svc_base_is_key) { int i; // Start at 1 so as not to overwrite the INTRA_FRAME - for (i = 1; i < MAX_REF_FRAMES; ++i) - copy_frame(&denoiser->running_avg_y[i], &src); + for (i = 1; i < denoiser->num_ref_frames; ++i) { + if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL) + copy_frame(&denoiser->running_avg_y[i + shift], &src); + } denoiser->reset = 0; return; } @@ -428,29 +468,29 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, // If more than one refresh occurs, must copy frame buffer. if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > 1) { if (refresh_alt_ref_frame) { - copy_frame(&denoiser->running_avg_y[ALTREF_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_golden_frame) { - copy_frame(&denoiser->running_avg_y[GOLDEN_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_last_frame) { - copy_frame(&denoiser->running_avg_y[LAST_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } } else { if (refresh_alt_ref_frame) { - swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_golden_frame) { - swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_last_frame) { - swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } } } @@ -479,19 +519,110 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse, } } -int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, - int ssy, +static int vp9_denoiser_realloc_svc_helper(VP9_COMMON *cm, + VP9_DENOISER *denoiser, int fb_idx) { + int fail = 0; + if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) { + fail = + vpx_alloc_frame_buffer(&denoiser->running_avg_y[fb_idx], cm->width, + cm->height, cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, 0); + if (fail) { + vp9_denoiser_free(denoiser); + return 1; + } + } + return 0; +} + +int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser, + int svc_buf_shift, int refresh_alt, + int refresh_gld, int 
refresh_lst, int alt_fb_idx, + int gld_fb_idx, int lst_fb_idx) { + int fail = 0; + if (refresh_alt) { + // Increase the frame buffer index by 1 to map it to the buffer index in the + // denoiser. + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + alt_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_gld) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + gld_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_lst) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + lst_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + return 0; +} + +int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, + int use_svc, int noise_sen, int width, int height, + int ssx, int ssy, #if CONFIG_VP9_HIGHBITDEPTH int use_highbitdepth, #endif int border) { - int i, fail; + int i, layer, fail, init_num_ref_frames; const int legacy_byte_alignment = 0; + int num_layers = 1; + int scaled_width = width; + int scaled_height = height; + if (use_svc) { + LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id * + svc->number_temporal_layers + + svc->temporal_layer_id]; + get_layer_resolution(width, height, lc->scaling_factor_num, + lc->scaling_factor_den, &scaled_width, &scaled_height); + // For SVC: only denoise at most 2 spatial (highest) layers. + if (noise_sen >= 2) + // Denoise from one spatial layer below the top. + svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 2, 0); + else + // Only denoise the top spatial layer. + svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 1, 0); + num_layers = svc->number_spatial_layers - svc->first_layer_denoise; + } assert(denoiser != NULL); + denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES; + init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES; + denoiser->num_layers = num_layers; + CHECK_MEM_ERROR(cm, denoiser->running_avg_y, + vpx_calloc(denoiser->num_ref_frames * num_layers, + sizeof(denoiser->running_avg_y[0]))); + CHECK_MEM_ERROR( + cm, denoiser->mc_running_avg_y, + vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0]))); - for (i = 0; i < MAX_REF_FRAMES; ++i) { - fail = vpx_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height, - ssx, ssy, + for (layer = 0; layer < num_layers; ++layer) { + const int denoise_width = (layer == 0) ? width : scaled_width; + const int denoise_height = (layer == 0) ? 
height : scaled_height; + for (i = 0; i < init_num_ref_frames; ++i) { + fail = vpx_alloc_frame_buffer( + &denoiser->running_avg_y[i + denoiser->num_ref_frames * layer], + denoise_width, denoise_height, ssx, ssy, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + border, legacy_byte_alignment); + if (fail) { + vp9_denoiser_free(denoiser); + return 1; + } +#ifdef OUTPUT_YUV_DENOISED + make_grayscale(&denoiser->running_avg_y[i]); +#endif + } + + fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y[layer], + denoise_width, denoise_height, ssx, ssy, #if CONFIG_VP9_HIGHBITDEPTH use_highbitdepth, #endif @@ -500,22 +631,10 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, vp9_denoiser_free(denoiser); return 1; } -#ifdef OUTPUT_YUV_DENOISED - make_grayscale(&denoiser->running_avg_y[i]); -#endif - } - - fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height, ssx, - ssy, -#if CONFIG_VP9_HIGHBITDEPTH - use_highbitdepth, -#endif - border, legacy_byte_alignment); - if (fail) { - vp9_denoiser_free(denoiser); - return 1; } + // denoiser->last_source only used for noise_estimation, so only for top + // layer. fail = vpx_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy, #if CONFIG_VP9_HIGHBITDEPTH use_highbitdepth, @@ -528,7 +647,6 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, #ifdef OUTPUT_YUV_DENOISED make_grayscale(&denoiser->running_avg_y[i]); #endif - denoiser->increase_denoising = 0; denoiser->frame_buffer_initialized = 1; denoiser->denoising_level = kDenLow; denoiser->prev_denoising_level = kDenLow; @@ -542,10 +660,18 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser) { return; } denoiser->frame_buffer_initialized = 0; - for (i = 0; i < MAX_REF_FRAMES; ++i) { + for (i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) { vpx_free_frame_buffer(&denoiser->running_avg_y[i]); } - vpx_free_frame_buffer(&denoiser->mc_running_avg_y); + vpx_free(denoiser->running_avg_y); + denoiser->running_avg_y = NULL; + + for (i = 0; i < denoiser->num_layers; ++i) { + vpx_free_frame_buffer(&denoiser->mc_running_avg_y[i]); + } + + vpx_free(denoiser->mc_running_avg_y); + denoiser->mc_running_avg_y = NULL; vpx_free_frame_buffer(&denoiser->last_source); } @@ -559,6 +685,34 @@ void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level) { denoiser->prev_denoising_level = denoiser->denoising_level; } +// Scale/increase the partition threshold +// for denoiser speed-up. +int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, + int content_state, int temporal_layer_id) { + if ((content_state == kLowSadLowSumdiff) || + (content_state == kHighSadLowSumdiff) || + (content_state == kLowVarHighSumdiff) || (noise_level == kDenHigh) || + (temporal_layer_id != 0)) { + int64_t scaled_thr = + (temporal_layer_id < 2) ? (3 * threshold) >> 1 : (7 * threshold) >> 2; + return scaled_thr; + } else { + return (5 * threshold) >> 2; + } +} + +// Scale/increase the ac skip threshold for +// denoiser speed-up. +int64_t vp9_scale_acskip_thresh(int64_t threshold, + VP9_DENOISER_LEVEL noise_level, int abs_sumdiff, + int temporal_layer_id) { + if (noise_level >= kDenLow && abs_sumdiff < 5) + return threshold *= + (noise_level == kDenLow) ? 2 : (temporal_layer_id == 2) ? 
10 : 6; + else + return threshold; +} + #ifdef OUTPUT_YUV_DENOISED static void make_grayscale(YV12_BUFFER_CONFIG *yuv) { int r, c; diff --git a/libs/libvpx/vp9/encoder/vp9_denoiser.h b/libs/libvpx/vp9/encoder/vp9_denoiser.h index fcfaa5051a..f4da24cbf6 100644 --- a/libs/libvpx/vp9/encoder/vp9_denoiser.h +++ b/libs/libvpx/vp9/encoder/vp9_denoiser.h @@ -21,6 +21,14 @@ extern "C" { #define MOTION_MAGNITUDE_THRESHOLD (8 * 3) +// Denoiser is used in non svc real-time mode which does not use alt-ref, so no +// need to allocate for it, and hence we need MAX_REF_FRAME - 1 +#define NONSVC_REF_FRAMES MAX_REF_FRAMES - 1 + +// Number of frame buffers when SVC is used. [0] for current denoised buffer and +// [1..8] for REF_FRAMES +#define SVC_REF_FRAMES 9 + typedef enum vp9_denoiser_decision { COPY_BLOCK, FILTER_BLOCK, @@ -35,12 +43,13 @@ typedef enum vp9_denoiser_level { } VP9_DENOISER_LEVEL; typedef struct vp9_denoiser { - YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES]; - YV12_BUFFER_CONFIG mc_running_avg_y; + YV12_BUFFER_CONFIG *running_avg_y; + YV12_BUFFER_CONFIG *mc_running_avg_y; YV12_BUFFER_CONFIG last_source; - int increase_denoising; int frame_buffer_initialized; int reset; + int num_ref_frames; + int num_layers; VP9_DENOISER_LEVEL denoising_level; VP9_DENOISER_LEVEL prev_denoising_level; } VP9_DENOISER; @@ -58,13 +67,13 @@ typedef struct { } VP9_PICKMODE_CTX_DEN; struct VP9_COMP; +struct SVC; -void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, - YV12_BUFFER_CONFIG src, - FRAME_TYPE frame_type, - int refresh_alt_ref_frame, - int refresh_golden_frame, - int refresh_last_frame, int resized); +void vp9_denoiser_update_frame_info( + VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type, + int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame, + int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized, + int svc_base_is_key, int second_spatial_layer); void vp9_denoiser_denoise(struct VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, @@ -76,8 +85,14 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse, PREDICTION_MODE mode, PICK_MODE_CONTEXT *ctx); -int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, - int ssy, +int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser, + int svc_buf_shift, int refresh_alt, + int refresh_gld, int refresh_lst, int alt_fb_idx, + int gld_fb_idx, int lst_fb_idx); + +int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, + int use_svc, int noise_sen, int width, int height, + int ssx, int ssy, #if CONFIG_VP9_HIGHBITDEPTH int use_highbitdepth, #endif @@ -97,6 +112,13 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser); void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level); +int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, + int content_state, int temporal_layer_id); + +int64_t vp9_scale_acskip_thresh(int64_t threshold, + VP9_DENOISER_LEVEL noise_level, int abs_sumdiff, + int temporal_layer_id); + #ifdef __cplusplus } // extern "C" #endif diff --git a/libs/libvpx/vp9/encoder/vp9_encodeframe.c b/libs/libvpx/vp9/encoder/vp9_encodeframe.c index 335faca82b..682477df18 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/libs/libvpx/vp9/encoder/vp9_encodeframe.c @@ -52,6 +52,33 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, int output_enabled, int mi_row, int mi_col, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx); +// 
Machine learning-based early termination parameters. +static const double train_mean[24] = { + 303501.697372, 3042630.372158, 24.694696, 1.392182, + 689.413511, 162.027012, 1.478213, 0.0, + 135382.260230, 912738.513263, 28.845217, 1.515230, + 544.158492, 131.807995, 1.436863, 0.0, + 43682.377587, 208131.711766, 28.084737, 1.356677, + 138.254122, 119.522553, 1.252322, 0.0 +}; + +static const double train_stdm[24] = { + 673689.212982, 5996652.516628, 0.024449, 1.989792, + 985.880847, 0.014638, 2.001898, 0.0, + 208798.775332, 1812548.443284, 0.018693, 1.838009, + 396.986910, 0.015657, 1.332541, 0.0, + 55888.847031, 448587.962714, 0.017900, 1.904776, + 98.652832, 0.016598, 1.320992, 0.0 +}; + +// Error tolerance: 0.01%-0.0.05%-0.1% +static const double classifiers[24] = { + 0.111736, 0.289977, 0.042219, 0.204765, 0.120410, -0.143863, + 0.282376, 0.847811, 0.637161, 0.131570, 0.018636, 0.202134, + 0.112797, 0.028162, 0.182450, 1.124367, 0.386133, 0.083700, + 0.050028, 0.150873, 0.061119, 0.109318, 0.127255, 0.625211 +}; + // This is used as a reference when computing the source variance for the // purpose of activity masking. // Eventually this should be replaced by custom no-reference routines, @@ -98,19 +125,17 @@ static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = { }; #endif // CONFIG_VP9_HIGHBITDEPTH -unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi, - const struct buf_2d *ref, - BLOCK_SIZE bs) { +unsigned int vp9_get_sby_variance(VP9_COMP *cpi, const struct buf_2d *ref, + BLOCK_SIZE bs) { unsigned int sse; const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, VP9_VAR_OFFS, 0, &sse); - return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); + return var; } #if CONFIG_VP9_HIGHBITDEPTH -unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, - const struct buf_2d *ref, - BLOCK_SIZE bs, int bd) { +unsigned int vp9_high_get_sby_variance(VP9_COMP *cpi, const struct buf_2d *ref, + BLOCK_SIZE bs, int bd) { unsigned int var, sse; switch (bd) { case 10: @@ -130,7 +155,24 @@ unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8), 0, &sse); break; } - return ROUND64_POWER_OF_TWO((int64_t)var, num_pels_log2_lookup[bs]); + return var; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs) { + return ROUND_POWER_OF_TWO(vp9_get_sby_variance(cpi, ref, bs), + num_pels_log2_lookup[bs]); +} + +#if CONFIG_VP9_HIGHBITDEPTH +unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs, int bd) { + return (unsigned int)ROUND64_POWER_OF_TWO( + (int64_t)vp9_high_get_sby_variance(cpi, ref, bs, bd), + num_pels_log2_lookup[bs]); } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -259,8 +301,12 @@ static void set_block_size(VP9_COMP *const cpi, MACROBLOCK *const x, } typedef struct { - int64_t sum_square_error; - int64_t sum_error; + // This struct is used for computing variance in choose_partitioning(), where + // the max number of samples within a superblock is 16x16 (with 4x4 avg). Even + // in high bitdepth, uint32_t is enough for sum_square_error (2^12 * 2^12 * 16 + // * 16 = 2^32). + uint32_t sum_square_error; + int32_t sum_error; int log2_count; int variance; } var; @@ -353,7 +399,7 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { } // Set variance values given sum square error, sum error, count. 
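/* Illustrative sketch (not part of this patch): what "set variance values
 * given sum square error, sum error, count" amounts to, i.e. the integer form
 * of var = E[x^2] - E[x]^2 over 1 << log2_count samples. The exact fixed-point
 * scaling used by libvpx's fill_variance()/get_variance() may differ; this
 * only shows the arithmetic the uint32_t/int32_t field widths above have to
 * accommodate. */
#include <stdint.h>

static uint32_t variance_from_sums(uint32_t sum_square_error,
                                   int32_t sum_error, int log2_count) {
  /* sum_error may be negative; widen before squaring to avoid overflow. */
  const int64_t mean_sq = ((int64_t)sum_error * sum_error) >> log2_count;
  return (uint32_t)(((int64_t)sum_square_error - mean_sq) >> log2_count);
}
/* Example: 16 samples (log2_count = 4) all equal to 100 give sum = 1600 and
 * sse = 160000, so variance_from_sums() returns 0, as expected. */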
-static void fill_variance(int64_t s2, int64_t s, int c, var *v) { +static void fill_variance(uint32_t s2, int32_t s, int c, var *v) { v->sum_square_error = s2; v->sum_error = s; v->log2_count = c; @@ -461,24 +507,46 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x, return 0; } +static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, + int width, int height, + int content_state) { + if (speed >= 8) { + if (width <= 640 && height <= 480) + return (5 * threshold_base) >> 2; + else if ((content_state == kLowSadLowSumdiff) || + (content_state == kHighSadLowSumdiff) || + (content_state == kLowVarHighSumdiff)) + return (5 * threshold_base) >> 2; + } else if (speed == 7) { + if ((content_state == kLowSadLowSumdiff) || + (content_state == kHighSadLowSumdiff) || + (content_state == kLowVarHighSumdiff)) { + return (5 * threshold_base) >> 2; + } + } + return threshold_base; +} + // Set the variance split thresholds for following the block sizes: // 0 - threshold_64x64, 1 - threshold_32x32, 2 - threshold_16x16, // 3 - vbp_threshold_8x8. vbp_threshold_8x8 (to split to 4x4 partition) is // currently only used on key frame. -static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) { +static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q, + int content_state) { VP9_COMMON *const cm = &cpi->common; const int is_key_frame = (cm->frame_type == KEY_FRAME); const int threshold_multiplier = is_key_frame ? 20 : 1; int64_t threshold_base = (int64_t)(threshold_multiplier * cpi->y_dequant[q][1]); + if (is_key_frame) { thresholds[0] = threshold_base; thresholds[1] = threshold_base >> 2; thresholds[2] = threshold_base >> 2; thresholds[3] = threshold_base << 2; } else { - // Increase base variance threshold based on estimated noise level. - if (cpi->noise_estimate.enabled) { + // Increase base variance threshold based on estimated noise level. + if (cpi->noise_estimate.enabled && cm->width >= 640 && cm->height >= 480) { NOISE_LEVEL noise_level = vp9_noise_estimate_extract_level(&cpi->noise_estimate); if (noise_level == kHigh) @@ -488,8 +556,25 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) { else if (noise_level < kLow) threshold_base = (7 * threshold_base) >> 3; } +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->oxcf.speed > 5 && cpi->denoiser.denoising_level >= kDenLow) + threshold_base = + vp9_scale_part_thresh(threshold_base, cpi->denoiser.denoising_level, + content_state, cpi->svc.temporal_layer_id); + else + threshold_base = + scale_part_thresh_sumdiff(threshold_base, cpi->oxcf.speed, cm->width, + cm->height, content_state); +#else + // Increase base variance threshold based on content_state/sum_diff level. 
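/* Illustrative sketch (assumed example values, not part of this patch): the
 * threshold scaling in vp9_scale_part_thresh() / scale_part_thresh_sumdiff()
 * above is plain fixed-point multiplication of the base threshold T:
 *   (5 * T) >> 2 == 1.25 * T   (low-sumdiff / sumdiff-based scaling)
 *   (3 * T) >> 1 == 1.50 * T   (denoiser path, lower temporal layers)
 *   (7 * T) >> 2 == 1.75 * T   (denoiser path, higher temporal layers)
 */
#include <assert.h>
#include <stdint.h>

static void scale_factor_example(void) {
  const int64_t T = 8000;
  assert(((5 * T) >> 2) == 10000);
  assert(((3 * T) >> 1) == 12000);
  assert(((7 * T) >> 2) == 14000);
}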
+ threshold_base = scale_part_thresh_sumdiff( + threshold_base, cpi->oxcf.speed, cm->width, cm->height, content_state); +#endif thresholds[0] = threshold_base; thresholds[2] = threshold_base << cpi->oxcf.speed; + if (cm->width >= 1280 && cm->height >= 720 && cpi->oxcf.speed < 7) + thresholds[2] = thresholds[2] << 1; if (cm->width <= 352 && cm->height <= 288) { thresholds[0] = threshold_base >> 3; thresholds[1] = threshold_base >> 1; @@ -504,7 +589,8 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) { } } -void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) { +void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q, + int content_state) { VP9_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; const int is_key_frame = (cm->frame_type == KEY_FRAME); @@ -512,10 +598,11 @@ void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) { sf->partition_search_type != REFERENCE_PARTITION) { return; } else { - set_vbp_thresholds(cpi, cpi->vbp_thresholds, q); + set_vbp_thresholds(cpi, cpi->vbp_thresholds, q, content_state); // The thresholds below are not changed locally. if (is_key_frame) { cpi->vbp_threshold_sad = 0; + cpi->vbp_threshold_copy = 0; cpi->vbp_bsize_min = BLOCK_8X8; } else { if (cm->width <= 352 && cm->height <= 288) @@ -525,6 +612,14 @@ void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) { ? (cpi->y_dequant[q][1] << 1) : 1000; cpi->vbp_bsize_min = BLOCK_16X16; + if (cm->width <= 352 && cm->height <= 288) + cpi->vbp_threshold_copy = 4000; + else if (cm->width <= 640 && cm->height <= 360) + cpi->vbp_threshold_copy = 8000; + else + cpi->vbp_threshold_copy = (cpi->y_dequant[q][1] << 3) > 8000 + ? (cpi->y_dequant[q][1] << 3) + : 8000; } cpi->vbp_threshold_minmax = 15 + (q >> 3); } @@ -638,12 +733,14 @@ static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d, } } -#if !CONFIG_VP9_HIGHBITDEPTH // Check if most of the superblock is skin content, and if so, force split to // 32x32, and set x->sb_is_skin for use in mode selection. static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, int mi_row, int mi_col, int *force_split) { VP9_COMMON *const cm = &cpi->common; +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) return 0; +#endif // Avoid checking superblocks on/near boundary and avoid low resolutions. // Note superblock may still pick 64X64 if y_sad is very small // (i.e., y_sad < cpi->vbp_threshold_sad) below. For now leave this as is. 
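/* Illustrative sketch (hypothetical helper, not part of this patch): the vbp
 * threshold array filled in by set_vbp_thresholds() above is indexed per split
 * level -- thresholds[0] for 64x64, [1] for 32x32, [2] for 16x16 and [3] for
 * the 8x8/4x4 decision on key frames -- and a block is pushed toward a split
 * when its measured variance exceeds the threshold for its level: */
#include <stdint.h>

static int variance_suggests_split(int64_t block_variance,
                                   const int64_t thresholds[4], int level) {
  return block_variance > thresholds[level];
}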
@@ -666,16 +763,7 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, for (i = 0; i < ymis; i += 2) { for (j = 0; j < xmis; j += 2) { int bl_index = block_index + i * cm->mi_cols + j; - int bl_index1 = bl_index + 1; - int bl_index2 = bl_index + cm->mi_cols; - int bl_index3 = bl_index2 + 1; - int consec_zeromv = - VPXMIN(cpi->consec_zero_mv[bl_index], - VPXMIN(cpi->consec_zero_mv[bl_index1], - VPXMIN(cpi->consec_zero_mv[bl_index2], - cpi->consec_zero_mv[bl_index3]))); - int is_skin = vp9_compute_skin_block( - ysignal, usignal, vsignal, sp, spuv, BLOCK_16X16, consec_zeromv, 0); + int is_skin = cpi->skin_map[bl_index]; num_16x16_skin += is_skin; num_16x16_nonskin += (1 - is_skin); if (num_16x16_nonskin > 3) { @@ -698,7 +786,6 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, } return 0; } -#endif static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, v64x64 *vt, int64_t thresholds[], @@ -742,9 +829,13 @@ static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, continue; if ((*this_mi)->sb_type == BLOCK_32X32) { - if (vt->split[i].part_variances.none.variance < (thresholds[1] >> 1)) + int64_t threshold_32x32 = (cpi->sf.short_circuit_low_temp_var == 1 || + cpi->sf.short_circuit_low_temp_var == 3) + ? ((5 * thresholds[1]) >> 3) + : (thresholds[1] >> 1); + if (vt->split[i].part_variances.none.variance < threshold_32x32) x->variance_low[i + 5] = 1; - } else if (cpi->sf.short_circuit_low_temp_var == 2) { + } else if (cpi->sf.short_circuit_low_temp_var >= 2) { // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block // inside. if ((*this_mi)->sb_type == BLOCK_16X16 || @@ -762,12 +853,297 @@ static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, } } +static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x, + MACROBLOCKD *xd, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE *prev_part = cpi->prev_partition; + int start_pos = mi_row * cm->mi_stride + mi_col; + + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + BLOCK_SIZE subsize; + PARTITION_TYPE partition; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + partition = partition_lookup[bsl][prev_part[start_pos]]; + subsize = get_subsize(bsize, partition); + + if (subsize < BLOCK_8X8) { + set_block_size(cpi, x, xd, mi_row, mi_col, bsize); + } else { + switch (partition) { + case PARTITION_NONE: + set_block_size(cpi, x, xd, mi_row, mi_col, bsize); + break; + case PARTITION_HORZ: + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row + bs, mi_col, subsize); + break; + case PARTITION_VERT: + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row, mi_col + bs, subsize); + break; + case PARTITION_SPLIT: + copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col); + copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col); + copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col + bs); + copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col + bs); + break; + default: assert(0); + } + } +} + +static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + int mi_row, int mi_col, int segment_id, + int sb_offset) { + int svc_copy_allowed = 1; + int frames_since_key_thresh = 1; + if (cpi->use_svc) { + // For SVC, don't allow copy if base spatial layer is key frame, or if + // frame is not a temporal 
enhancement layer frame. + int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + if (lc->is_key_frame || !cpi->svc.non_reference_frame) svc_copy_allowed = 0; + frames_since_key_thresh = cpi->svc.number_spatial_layers << 1; + } + if (cpi->rc.frames_since_key > frames_since_key_thresh && svc_copy_allowed && + !cpi->resize_pending && segment_id == CR_SEGMENT_ID_BASE && + cpi->prev_segment_id[sb_offset] == CR_SEGMENT_ID_BASE && + cpi->copied_frame_cnt[sb_offset] < cpi->max_copied_frame) { + if (cpi->prev_partition != NULL) { + copy_partitioning_helper(cpi, x, xd, BLOCK_64X64, mi_row, mi_col); + cpi->copied_frame_cnt[sb_offset] += 1; + memcpy(x->variance_low, &(cpi->prev_variance_low[sb_offset * 25]), + sizeof(x->variance_low)); + return 1; + } + } + + return 0; +} + +static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int mi_row_high, int mi_col_high) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + BLOCK_SIZE *prev_part = svc->prev_partition_svc; + // Variables with _high are for higher resolution. + int bsize_high = 0; + int subsize_high = 0; + const int bsl_high = b_width_log2_lookup[bsize]; + const int bs_high = (1 << bsl_high) >> 2; + const int has_rows = (mi_row_high + bs_high) < cm->mi_rows; + const int has_cols = (mi_col_high + bs_high) < cm->mi_cols; + + const int row_boundary_block_scale_factor[BLOCK_SIZES] = { + 13, 13, 13, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0 + }; + const int col_boundary_block_scale_factor[BLOCK_SIZES] = { + 13, 13, 13, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0 + }; + int start_pos; + BLOCK_SIZE bsize_low; + PARTITION_TYPE partition_high; + + if (mi_row_high >= cm->mi_rows || mi_col_high >= cm->mi_cols) return 0; + if (mi_row >= (cm->mi_rows >> 1) || mi_col >= (cm->mi_cols >> 1)) return 0; + + // Find corresponding (mi_col/mi_row) block down-scaled by 2x2. + start_pos = mi_row * (svc->mi_stride[svc->spatial_layer_id - 1]) + mi_col; + bsize_low = prev_part[start_pos]; + // The block size is too big for boundaries. Do variance based partitioning. + if ((!has_rows || !has_cols) && bsize_low > BLOCK_16X16) return 1; + + // For reference frames: return 1 (do variance-based partitioning) if the + // superblock is not low source sad and lower-resoln bsize is below 32x32. + if (!cpi->svc.non_reference_frame && !x->skip_low_source_sad && + bsize_low < BLOCK_32X32) + return 1; + + // Scale up block size by 2x2. Force 64x64 for size larger than 32x32. + if (bsize_low < BLOCK_32X32) { + bsize_high = bsize_low + 3; + } else if (bsize_low >= BLOCK_32X32) { + bsize_high = BLOCK_64X64; + } + // Scale up blocks on boundary. 
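/* Illustrative sketch (enum values assumed for illustration, not part of this
 * patch): in the VP9 BLOCK_SIZE enum the sizes from 8x8 upward are laid out so
 * that adding 3 doubles both dimensions (BLOCK_8X8 + 3 == BLOCK_16X16,
 * BLOCK_16X16 + 3 == BLOCK_32X32, BLOCK_8X16 + 3 == BLOCK_16X32, ...). That is
 * what the "bsize_high = bsize_low + 3" scale-up above relies on when mapping
 * a partition decided on the 2x-downscaled spatial layer onto the full
 * resolution layer, with anything at or above 32x32 promoted to 64x64: */
static int scale_up_2x2(int bsize_low) {
  const int kBlock32x32 = 9, kBlock64x64 = 12; /* assumed enum values */
  return (bsize_low < kBlock32x32) ? bsize_low + 3 : kBlock64x64;
}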
+ if (!has_cols && has_rows) { + bsize_high = bsize_low + row_boundary_block_scale_factor[bsize_low]; + } else if (has_cols && !has_rows) { + bsize_high = bsize_low + col_boundary_block_scale_factor[bsize_low]; + } else if (!has_cols && !has_rows) { + bsize_high = bsize_low; + } + + partition_high = partition_lookup[bsl_high][bsize_high]; + subsize_high = get_subsize(bsize, partition_high); + + if (subsize_high < BLOCK_8X8) { + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high); + } else { + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + switch (partition_high) { + case PARTITION_NONE: + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high); + break; + case PARTITION_HORZ: + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high); + if (subsize_high < BLOCK_64X64) + set_block_size(cpi, x, xd, mi_row_high + bs_high, mi_col_high, + subsize_high); + break; + case PARTITION_VERT: + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high); + if (subsize_high < BLOCK_64X64) + set_block_size(cpi, x, xd, mi_row_high, mi_col_high + bs_high, + subsize_high); + break; + case PARTITION_SPLIT: + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, mi_col, + mi_row_high, mi_col_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1), + mi_col, mi_row_high + bs_high, mi_col_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, + mi_col + (bs >> 1), mi_row_high, + mi_col_high + bs_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1), + mi_col + (bs >> 1), mi_row_high + bs_high, + mi_col_high + bs_high)) + return 1; + break; + default: assert(0); + } + } + + return 0; +} + +static void update_partition_svc(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE *prev_part = cpi->svc.prev_partition_svc; + int start_pos = mi_row * cm->mi_stride + mi_col; + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + BLOCK_SIZE subsize; + PARTITION_TYPE partition; + const MODE_INFO *mi = NULL; + int xx, yy; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + mi = cm->mi_grid_visible[start_pos]; + partition = partition_lookup[bsl][mi->sb_type]; + subsize = get_subsize(bsize, partition); + if (subsize < BLOCK_8X8) { + prev_part[start_pos] = bsize; + } else { + switch (partition) { + case PARTITION_NONE: + prev_part[start_pos] = bsize; + if (bsize == BLOCK_64X64) { + for (xx = 0; xx < 8; xx += 4) + for (yy = 0; yy < 8; yy += 4) { + if ((mi_row + xx < cm->mi_rows) && (mi_col + yy < cm->mi_cols)) + prev_part[start_pos + xx * cm->mi_stride + yy] = bsize; + } + } + break; + case PARTITION_HORZ: + prev_part[start_pos] = subsize; + if (mi_row + bs < cm->mi_rows) + prev_part[start_pos + bs * cm->mi_stride] = subsize; + break; + case PARTITION_VERT: + prev_part[start_pos] = subsize; + if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; + break; + case PARTITION_SPLIT: + update_partition_svc(cpi, subsize, mi_row, mi_col); + update_partition_svc(cpi, subsize, mi_row + bs, mi_col); + update_partition_svc(cpi, subsize, mi_row, mi_col + bs); + update_partition_svc(cpi, subsize, mi_row + bs, mi_col + bs); + break; + default: assert(0); + } + } +} + +static void update_prev_partition_helper(VP9_COMP *cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE *prev_part = 
cpi->prev_partition; + int start_pos = mi_row * cm->mi_stride + mi_col; + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + BLOCK_SIZE subsize; + PARTITION_TYPE partition; + const MODE_INFO *mi = NULL; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + mi = cm->mi_grid_visible[start_pos]; + partition = partition_lookup[bsl][mi->sb_type]; + subsize = get_subsize(bsize, partition); + if (subsize < BLOCK_8X8) { + prev_part[start_pos] = bsize; + } else { + switch (partition) { + case PARTITION_NONE: prev_part[start_pos] = bsize; break; + case PARTITION_HORZ: + prev_part[start_pos] = subsize; + if (mi_row + bs < cm->mi_rows) + prev_part[start_pos + bs * cm->mi_stride] = subsize; + break; + case PARTITION_VERT: + prev_part[start_pos] = subsize; + if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; + break; + case PARTITION_SPLIT: + update_prev_partition_helper(cpi, subsize, mi_row, mi_col); + update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col); + update_prev_partition_helper(cpi, subsize, mi_row, mi_col + bs); + update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col + bs); + break; + default: assert(0); + } + } +} + +static void update_prev_partition(VP9_COMP *cpi, MACROBLOCK *x, int segment_id, + int mi_row, int mi_col, int sb_offset) { + update_prev_partition_helper(cpi, BLOCK_64X64, mi_row, mi_col); + cpi->prev_segment_id[sb_offset] = segment_id; + memcpy(&(cpi->prev_variance_low[sb_offset * 25]), x->variance_low, + sizeof(x->variance_low)); + // Reset the counter for copy partitioning + cpi->copied_frame_cnt[sb_offset] = 0; +} + static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize, unsigned int y_sad, int is_key_frame) { int i; MACROBLOCKD *xd = &x->e_mbd; + if (is_key_frame) return; + // For speed >= 8, avoid the chroma check if y_sad is above threshold. + if (cpi->oxcf.speed >= 8) { + if (y_sad > cpi->vbp_thresholds[1] && + (!cpi->noise_estimate.enabled || + vp9_noise_estimate_extract_level(&cpi->noise_estimate) < kMedium)) + return; + } + for (i = 1; i <= 2; ++i) { unsigned int uv_sad = UINT_MAX; struct macroblock_plane *p = &x->plane[i]; @@ -784,6 +1160,55 @@ static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize, } } +static uint64_t avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift, + int sb_offset) { + unsigned int tmp_sse; + uint64_t tmp_sad; + unsigned int tmp_variance; + const BLOCK_SIZE bsize = BLOCK_64X64; + uint8_t *src_y = cpi->Source->y_buffer; + int src_ystride = cpi->Source->y_stride; + uint8_t *last_src_y = cpi->Last_Source->y_buffer; + int last_src_ystride = cpi->Last_Source->y_stride; + uint64_t avg_source_sad_threshold = 10000; + uint64_t avg_source_sad_threshold2 = 12000; +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) return 0; +#endif + src_y += shift; + last_src_y += shift; + tmp_sad = + cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, last_src_ystride); + tmp_variance = vpx_variance64x64(src_y, src_ystride, last_src_y, + last_src_ystride, &tmp_sse); + // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12) + if (tmp_sad < avg_source_sad_threshold) + x->content_state_sb = ((tmp_sse - tmp_variance) < 25) ? kLowSadLowSumdiff + : kLowSadHighSumdiff; + else + x->content_state_sb = ((tmp_sse - tmp_variance) < 25) ? kHighSadLowSumdiff + : kHighSadHighSumdiff; + + // Detect large lighting change. 
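/* Illustrative worked example (not part of this patch): a 64x64 block has 4096
 * pixels and vpx_variance64x64() returns sse - (sum * sum) / 4096, so the
 * (tmp_sse - tmp_variance) term above recovers (sum * sum) >> 12, as noted.
 * The "< 25" test therefore means sum * sum < 25 * 4096, i.e. |sum| < 320,
 * i.e. an average per-pixel frame difference below roughly 320 / 4096 ~= 0.08,
 * which is what gets classified as the LowSumdiff content states. */
#include <assert.h>
#include <stdint.h>

static void sumdiff_threshold_example(void) {
  const int64_t sum = 320; /* borderline case for a 64x64 block */
  assert(((sum * sum) >> 12) == 25);
}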
+ if (cpi->oxcf.content != VP9E_CONTENT_SCREEN && + cpi->oxcf.rc_mode == VPX_CBR && tmp_variance < (tmp_sse >> 3) && + (tmp_sse - tmp_variance) > 10000) + x->content_state_sb = kLowVarHighSumdiff; + else if (tmp_sad > (avg_source_sad_threshold << 1)) + x->content_state_sb = kVeryHighSad; + + if (cpi->content_state_sb_fd != NULL) { + if (tmp_sad < avg_source_sad_threshold2) { + // Cap the increment to 255. + if (cpi->content_state_sb_fd[sb_offset] < 255) + cpi->content_state_sb_fd[sb_offset]++; + } else { + cpi->content_state_sb_fd[sb_offset] = 0; + } + } + return tmp_sad; +} + // This function chooses partitioning based on the variance between source and // reconstructed last, where variance is computed for down-sampled inputs. static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, @@ -792,14 +1217,23 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, MACROBLOCKD *xd = &x->e_mbd; int i, j, k, m; v64x64 vt; - v16x16 vt2[16]; + v16x16 *vt2 = NULL; int force_split[21]; int avg_32x32; + int max_var_32x32 = 0; + int min_var_32x32 = INT_MAX; + int var_32x32; int avg_16x16[4]; + int maxvar_16x16[4]; + int minvar_16x16[4]; + int64_t threshold_4x4avg; + NOISE_LEVEL noise_level = kLow; + int content_state = 0; uint8_t *s; const uint8_t *d; int sp; int dp; + int compute_minmax_variance = 1; unsigned int y_sad = UINT_MAX; BLOCK_SIZE bsize = BLOCK_64X64; // Ref frame used in partitioning. @@ -819,17 +1253,61 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, const int low_res = (cm->width <= 352 && cm->height <= 288); int variance4x4downsample[16]; int segment_id; + int sb_offset = (cm->mi_stride >> 3) * (mi_row >> 3) + (mi_col >> 3); set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); segment_id = xd->mi[0]->segment_id; - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) { - if (cyclic_refresh_segment_id_boosted(segment_id)) { - int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); - set_vbp_thresholds(cpi, thresholds, q); + + if (cpi->oxcf.speed >= 8 || (cpi->use_svc && cpi->svc.non_reference_frame)) + compute_minmax_variance = 0; + + memset(x->variance_low, 0, sizeof(x->variance_low)); + + if (cpi->sf.use_source_sad && !is_key_frame) { + int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + content_state = x->content_state_sb; + x->skip_low_source_sad = (content_state == kLowSadLowSumdiff || + content_state == kLowSadHighSumdiff) + ? 1 + : 0; + x->lowvar_highsumdiff = (content_state == kLowVarHighSumdiff) ? 1 : 0; + if (cpi->content_state_sb_fd != NULL) + x->last_sb_high_content = cpi->content_state_sb_fd[sb_offset2]; + + // For SVC on top spatial layer: use/scale the partition from + // the lower spatial resolution if svc_use_lowres_part is enabled. + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 && + cpi->svc.prev_partition_svc != NULL && content_state != kVeryHighSad) { + if (!scale_partitioning_svc(cpi, x, xd, BLOCK_64X64, mi_row >> 1, + mi_col >> 1, mi_row, mi_col)) { + if (cpi->sf.copy_partition_flag) { + update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); + } + return 0; + } + } + // If source_sad is low copy the partition without computing the y_sad. 
+ if (x->skip_low_source_sad && cpi->sf.copy_partition_flag && + copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) { + x->sb_use_mv_part = 1; + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + return 0; } } - memset(x->variance_low, 0, sizeof(x->variance_low)); + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && + cyclic_refresh_segment_id_boosted(segment_id)) { + int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); + set_vbp_thresholds(cpi, thresholds, q, content_state); + } else { + set_vbp_thresholds(cpi, thresholds, cm->base_qindex, content_state); + } + + // For non keyframes, disable 4x4 average for low resolution when speed = 8 + threshold_4x4avg = (cpi->oxcf.speed < 8) ? thresholds[1] << 1 : INT64_MAX; if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3); @@ -846,10 +1324,10 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, // that the temporal reference frame will always be of type LAST_FRAME. // TODO(marpan): If that assumption is broken, we need to revisit this code. MODE_INFO *mi = xd->mi[0]; - const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); const YV12_BUFFER_CONFIG *yv12_g = NULL; - unsigned int y_sad_g, y_sad_thr; + unsigned int y_sad_g, y_sad_thr, y_sad_last; bsize = BLOCK_32X32 + (mi_col + 4 < cm->mi_cols) * 2 + (mi_row + 4 < cm->mi_rows); @@ -861,7 +1339,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME); } - if (yv12_g && yv12_g != yv12 && (cpi->ref_frame_flags & VP9_GOLD_FLAG)) { + // Only compute y_sad_g (sad for golden reference) for speed < 8. 
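/* Illustrative sketch (assumption; the comparison itself is outside the lines
 * shown here): the golden-frame SAD y_sad_g is weighed against a biased copy
 * of the last-frame SAD -- (y_sad * 7) >> 3, i.e. 7/8 of it, when
 * short_circuit_low_temp_var is on -- so GOLDEN is only chosen as the
 * partitioning reference when it is clearly better than LAST: */
static int pick_golden_for_partitioning(unsigned int y_sad_last,
                                        unsigned int y_sad_golden,
                                        int bias_last) {
  const unsigned int thr = bias_last ? (y_sad_last * 7) >> 3 : y_sad_last;
  return y_sad_golden < thr; /* hypothetical helper for illustration */
}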
+ if (cpi->oxcf.speed < 8 && yv12_g && yv12_g != yv12 && + (cpi->ref_frame_flags & VP9_GOLD_FLAG)) { vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, &cm->frame_refs[GOLDEN_FRAME - 1].sf); y_sad_g = cpi->fn_ptr[bsize].sdf( @@ -871,15 +1351,36 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, y_sad_g = UINT_MAX; } - vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, - &cm->frame_refs[LAST_FRAME - 1].sf); - mi->ref_frame[0] = LAST_FRAME; + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->rc.is_src_frame_alt_ref) { + yv12 = get_ref_frame_buffer(cpi, ALTREF_FRAME); + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[ALTREF_FRAME - 1].sf); + mi->ref_frame[0] = ALTREF_FRAME; + y_sad_g = UINT_MAX; + } else { + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[LAST_FRAME - 1].sf); + mi->ref_frame[0] = LAST_FRAME; + } mi->ref_frame[1] = NONE; mi->sb_type = BLOCK_64X64; mi->mv[0].as_int = 0; mi->interp_filter = BILINEAR; - y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); + if (cpi->oxcf.speed >= 8 && !low_res && + x->content_state_sb != kVeryHighSad) { + y_sad = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); + } else { + y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); + x->sb_use_mv_part = 1; + x->sb_mvcol_part = mi->mv[0].as_mv.col; + x->sb_mvrow_part = mi->mv[0].as_mv.row; + } + + y_sad_last = y_sad; // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad // are close if short_circuit_low_temp_var is on. y_sad_thr = cpi->sf.short_circuit_low_temp_var ? (y_sad * 7) >> 3 : y_sad; @@ -898,12 +1399,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64); - x->sb_is_skin = 0; -#if !CONFIG_VP9_HIGHBITDEPTH if (cpi->use_skin_detection) x->sb_is_skin = skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split); -#endif d = xd->plane[0].dst.buf; dp = xd->plane[0].dst.stride; @@ -916,10 +1414,30 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, if (mi_col + block_width / 2 < cm->mi_cols && mi_row + block_height / 2 < cm->mi_rows) { set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_64X64); + x->variance_low[0] = 1; chroma_check(cpi, x, bsize, y_sad, is_key_frame); + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + if (cpi->sf.copy_partition_flag) { + update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); + } return 0; } } + + // If the y_sad is small enough, copy the partition of the superblock in the + // last frame to current frame only if the last frame is not a keyframe. + // Stop the copy every cpi->max_copied_frame to refresh the partition. + // TODO(jianj) : tune the threshold. 
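/* Illustrative sketch (hypothetical helper, not part of this patch): the
 * copy/refresh behaviour described above -- a superblock may reuse the
 * previous frame's partition only while fewer than max_copied_frame
 * consecutive frames have done so; recomputing the partition (see
 * update_prev_partition()) resets the per-superblock counter. */
static int try_copy_partition(unsigned char *copied_frame_cnt, int sb_offset,
                              int max_copied_frame) {
  if (copied_frame_cnt[sb_offset] < max_copied_frame) {
    ++copied_frame_cnt[sb_offset]; /* reuse the previous partition */
    return 1;
  }
  return 0; /* recompute; the caller resets the counter afterwards */
}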
+ if (cpi->sf.copy_partition_flag && y_sad_last < cpi->vbp_threshold_copy && + copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) { + chroma_check(cpi, x, bsize, y_sad, is_key_frame); + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + return 0; + } } else { d = VP9_VAR_OFFS; dp = 0; @@ -935,6 +1453,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, #endif // CONFIG_VP9_HIGHBITDEPTH } + if (low_res && threshold_4x4avg < INT64_MAX) + CHECK_MEM_ERROR(cm, vt2, vpx_calloc(16, sizeof(*vt2))); // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances // for splits. for (i = 0; i < 4; i++) { @@ -943,6 +1463,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, const int i2 = i << 2; force_split[i + 1] = 0; avg_16x16[i] = 0; + maxvar_16x16[i] = 0; + minvar_16x16[i] = INT_MAX; for (j = 0; j < 4; j++) { const int x16_idx = x32_idx + ((j & 1) << 4); const int y16_idx = y32_idx + ((j >> 1) << 4); @@ -959,13 +1481,17 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16); get_variance(&vt.split[i].split[j].part_variances.none); avg_16x16[i] += vt.split[i].split[j].part_variances.none.variance; + if (vt.split[i].split[j].part_variances.none.variance < minvar_16x16[i]) + minvar_16x16[i] = vt.split[i].split[j].part_variances.none.variance; + if (vt.split[i].split[j].part_variances.none.variance > maxvar_16x16[i]) + maxvar_16x16[i] = vt.split[i].split[j].part_variances.none.variance; if (vt.split[i].split[j].part_variances.none.variance > thresholds[2]) { // 16X16 variance is above threshold for split, so force split to 8x8 // for this 16x16 block (this also forces splits for upper levels). force_split[split_index] = 1; force_split[i + 1] = 1; force_split[0] = 1; - } else if (cpi->oxcf.speed < 8 && + } else if (compute_minmax_variance && vt.split[i].split[j].part_variances.none.variance > thresholds[1] && !cyclic_refresh_segment_id_boosted(segment_id)) { @@ -977,7 +1503,10 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, xd->cur_buf->flags, #endif pixels_wide, pixels_high); - if (minmax > cpi->vbp_threshold_minmax) { + int thresh_minmax = (int)cpi->vbp_threshold_minmax; + if (x->content_state_sb == kVeryHighSad) + thresh_minmax = thresh_minmax << 1; + if (minmax > thresh_minmax) { force_split[split_index] = 1; force_split[i + 1] = 1; force_split[0] = 1; @@ -986,7 +1515,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } if (is_key_frame || (low_res && vt.split[i].split[j].part_variances.none.variance > - (thresholds[1] << 1))) { + threshold_4x4avg)) { force_split[split_index] = 0; // Go down to 4x4 down-sampling for variance. variance4x4downsample[i2 + j] = 1; @@ -1003,6 +1532,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } } + if (cpi->noise_estimate.enabled) + noise_level = vp9_noise_estimate_extract_level(&cpi->noise_estimate); // Fill the rest of the variance tree by summing split partition values. avg_32x32 = 0; for (i = 0; i < 4; i++) { @@ -1029,14 +1560,22 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, // (64x64) level. 
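/* Illustrative note (layout assumed from the 21-entry force_split[] array used
 * above, not part of this patch): the flags are arranged hierarchically --
 * index 0 for the 64x64 block, 1..4 for its four 32x32 children, 5..20 for the
 * sixteen 16x16 children -- so forcing a split at a leaf also marks every
 * ancestor: */
static void force_split_up_to_root(int *force_split, int idx32 /* 0..3 */,
                                   int idx16 /* 0..3 */) {
  force_split[5 + (idx32 << 2) + idx16] = 1; /* 16x16 leaf */
  force_split[idx32 + 1] = 1;                /* parent 32x32 */
  force_split[0] = 1;                        /* 64x64 root */
}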
if (!force_split[i + 1]) { get_variance(&vt.split[i].part_variances.none); + var_32x32 = vt.split[i].part_variances.none.variance; + max_var_32x32 = VPXMAX(var_32x32, max_var_32x32); + min_var_32x32 = VPXMIN(var_32x32, min_var_32x32); if (vt.split[i].part_variances.none.variance > thresholds[1] || (!is_key_frame && vt.split[i].part_variances.none.variance > (thresholds[1] >> 1) && vt.split[i].part_variances.none.variance > (avg_16x16[i] >> 1))) { force_split[i + 1] = 1; force_split[0] = 1; + } else if (!is_key_frame && noise_level < kLow && cm->height <= 360 && + (maxvar_16x16[i] - minvar_16x16[i]) > (thresholds[1] >> 1) && + maxvar_16x16[i] > thresholds[1]) { + force_split[i + 1] = 1; + force_split[0] = 1; } - avg_32x32 += vt.split[i].part_variances.none.variance; + avg_32x32 += var_32x32; } } if (!force_split[0]) { @@ -1044,7 +1583,17 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, get_variance(&vt.part_variances.none); // If variance of this 64x64 block is above (some threshold of) the average // variance over the sub-32x32 blocks, then force this block to split. - if (!is_key_frame && vt.part_variances.none.variance > (5 * avg_32x32) >> 4) + // Only checking this for noise level >= medium for now. + if (!is_key_frame && noise_level >= kMedium && + vt.part_variances.none.variance > (9 * avg_32x32) >> 5) + force_split[0] = 1; + // Else if the maximum 32x32 variance minus the miniumum 32x32 variance in + // a 64x64 block is greater than threshold and the maximum 32x32 variance is + // above a miniumum threshold, then force the split of a 64x64 block + // Only check this for low noise. + else if (!is_key_frame && noise_level < kMedium && + (max_var_32x32 - min_var_32x32) > 3 * (thresholds[0] >> 3) && + max_var_32x32 > thresholds[0] >> 1) force_split[0] = 1; } @@ -1099,12 +1648,21 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } + if (cm->frame_type != KEY_FRAME && cpi->sf.copy_partition_flag) { + update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); + } + + if (cm->frame_type != KEY_FRAME && cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + if (cpi->sf.short_circuit_low_temp_var) { set_low_temp_var_flag(cpi, x, xd, &vt, thresholds, ref_frame_partition, mi_col, mi_row); } chroma_check(cpi, x, bsize, y_sad, is_key_frame); + if (vt2) vpx_free(vt2); return 0; } @@ -1863,7 +2421,7 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, } } - if (cm->use_prev_frame_mvs || + if (cm->use_prev_frame_mvs || !cm->error_resilient_mode || (cpi->svc.use_base_mv && cpi->svc.number_spatial_layers > 1 && cpi->svc.spatial_layer_id != cpi->svc.number_spatial_layers - 1)) { MV_REF *const frame_mvs = @@ -2460,6 +3018,74 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, } #endif +// Calculate the score used in machine-learning based partition search early +// termination. 
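/* Illustrative note (not part of this patch) on the 64x64 split checks above:
 * avg_32x32 accumulates the four 32x32 variances, so it is a sum rather than a
 * mean. The check being replaced compared the 64x64 variance against
 * (5 * avg_32x32) >> 4, i.e. 1.25x the mean 32x32 variance; the new
 * medium/high-noise check uses (9 * avg_32x32) >> 5, i.e. about 1.125x the
 * mean. */
#include <assert.h>

static void avg32_scale_example(void) {
  const int sum_of_four_vars = 3200; /* mean 32x32 variance = 800 */
  assert(((5 * sum_of_four_vars) >> 4) == 1000); /* 1.25  * 800 */
  assert(((9 * sum_of_four_vars) >> 5) == 900);  /* 1.125 * 800 */
}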
+static double compute_score(VP9_COMMON *const cm, MACROBLOCKD *const xd, + PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const double *clf; + const double *mean; + const double *sd; + const int mag_mv = + abs(ctx->mic.mv[0].as_mv.col) + abs(ctx->mic.mv[0].as_mv.row); + const int left_in_image = !!xd->left_mi; + const int above_in_image = !!xd->above_mi; + MODE_INFO **prev_mi = + &cm->prev_mi_grid_visible[mi_col + cm->mi_stride * mi_row]; + int above_par = 0; // above_partitioning + int left_par = 0; // left_partitioning + int last_par = 0; // last_partitioning + BLOCK_SIZE context_size; + double score; + int offset = 0; + + assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]); + + if (above_in_image) { + context_size = xd->above_mi->sb_type; + if (context_size < bsize) + above_par = 2; + else if (context_size == bsize) + above_par = 1; + } + + if (left_in_image) { + context_size = xd->left_mi->sb_type; + if (context_size < bsize) + left_par = 2; + else if (context_size == bsize) + left_par = 1; + } + + if (prev_mi) { + context_size = prev_mi[0]->sb_type; + if (context_size < bsize) + last_par = 2; + else if (context_size == bsize) + last_par = 1; + } + + if (bsize == BLOCK_64X64) + offset = 0; + else if (bsize == BLOCK_32X32) + offset = 8; + else if (bsize == BLOCK_16X16) + offset = 16; + + // early termination score calculation + clf = &classifiers[offset]; + mean = &train_mean[offset]; + sd = &train_stdm[offset]; + score = clf[0] * (((double)ctx->rate - mean[0]) / sd[0]) + + clf[1] * (((double)ctx->dist - mean[1]) / sd[1]) + + clf[2] * (((double)mag_mv / 2 - mean[2]) * sd[2]) + + clf[3] * (((double)(left_par + above_par) / 2 - mean[3]) * sd[3]) + + clf[4] * (((double)ctx->sum_y_eobs - mean[4]) / sd[4]) + + clf[5] * (((double)cm->base_qindex - mean[5]) * sd[5]) + + clf[6] * (((double)last_par - mean[6]) * sd[6]) + clf[7]; + return score; +} + // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. @@ -2505,8 +3131,8 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, int partition_vert_allowed = !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; - int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_dist_thr; - int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr; + int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_thr.dist; + int rate_breakout_thr = cpi->sf.partition_search_breakout_thr.rate; (void)*tp_orig; @@ -2636,18 +3262,34 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } if (this_rdc.rdcost < best_rdc.rdcost) { + MODE_INFO *mi = xd->mi[0]; + best_rdc = this_rdc; if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; - // If all y, u, v transform blocks in this partition are skippable, and - // the dist & rate are within the thresholds, the partition search is - // terminated for current branch of the partition search tree. - if (!x->e_mbd.lossless && ctx->skippable && - ((best_rdc.dist < (dist_breakout_thr >> 2)) || - (best_rdc.dist < dist_breakout_thr && - best_rdc.rate < rate_breakout_thr))) { - do_split = 0; - do_rect = 0; + if (!cpi->sf.ml_partition_search_early_termination) { + // If all y, u, v transform blocks in this partition are skippable, + // and the dist & rate are within the thresholds, the partition search + // is terminated for current branch of the partition search tree. 
+ if (!x->e_mbd.lossless && ctx->skippable && + ((best_rdc.dist < (dist_breakout_thr >> 2)) || + (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr))) { + do_split = 0; + do_rect = 0; + } + } else { + // Currently, the machine-learning based partition search early + // termination is only used while bsize is 16x16, 32x32 or 64x64, + // VPXMIN(cm->width, cm->height) >= 480, and speed = 0. + if (!x->e_mbd.lossless && + !segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP) && + ctx->mic.mode >= INTRA_MODES && bsize >= BLOCK_16X16) { + if (compute_score(cm, xd, ctx, mi_row, mi_col, bsize) < 0.0) { + do_split = 0; + do_rect = 0; + } + } } #if CONFIG_FP_MB_STATS @@ -2760,7 +3402,8 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, pc_tree->partitioning = PARTITION_SPLIT; // Rate and distortion based partition search termination clause. - if (!x->e_mbd.lossless && ((best_rdc.dist < (dist_breakout_thr >> 2)) || + if (!cpi->sf.ml_partition_search_early_termination && + !x->e_mbd.lossless && ((best_rdc.dist < (dist_breakout_thr >> 2)) || (best_rdc.dist < dist_breakout_thr && best_rdc.rate < rate_breakout_thr))) { do_rect = 0; @@ -2903,13 +3546,18 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, const int mi_col_start = tile_info->mi_col_start; const int mi_col_end = tile_info->mi_col_end; int mi_col; + const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + const int num_sb_cols = + get_num_cols(tile_data->tile_info, MI_BLOCK_SIZE_LOG2); + int sb_col_in_tile; // Initialize the left context for the new SB row memset(&xd->left_context, 0, sizeof(xd->left_context)); memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context)); // Code each SB in the row - for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) { + for (mi_col = mi_col_start, sb_col_in_tile = 0; mi_col < mi_col_end; + mi_col += MI_BLOCK_SIZE, sb_col_in_tile++) { const struct segmentation *const seg = &cm->seg; int dummy_rate; int64_t dummy_dist; @@ -2920,6 +3568,9 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, const int idx_str = cm->mi_stride * mi_row + mi_col; MODE_INFO **mi = cm->mi_grid_visible + idx_str; + (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile); + if (sf->adaptive_pred_interp_filter) { for (i = 0; i < 64; ++i) td->leaf_tree[i].pred_interp_filter = SWITCHABLE; @@ -2971,6 +3622,8 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rdc, INT64_MAX, td->pc_root); } + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile, num_sb_cols); } } @@ -3044,7 +3697,7 @@ static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) { static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { - if (bsize < BLOCK_16X16) + if (!cpi->sf.nonrd_keyframe && bsize < BLOCK_16X16) vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); else vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); @@ -3208,6 +3861,9 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; (void)*tp_orig; + // Avoid checking for rectangular partitions for speed >= 6. 
+ if (cpi->oxcf.speed >= 6) do_rect = 0; + assert(num_8x8_blocks_wide_lookup[bsize] == num_8x8_blocks_high_lookup[bsize]); @@ -3251,8 +3907,8 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); if (this_rdc.rdcost < best_rdc.rdcost) { - int64_t dist_breakout_thr = sf->partition_search_breakout_dist_thr; - int64_t rate_breakout_thr = sf->partition_search_breakout_rate_thr; + int64_t dist_breakout_thr = sf->partition_search_breakout_thr.dist; + int64_t rate_breakout_thr = sf->partition_search_breakout_thr.rate; dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); @@ -3435,6 +4091,8 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, PARTITION_TYPE partition; BLOCK_SIZE subsize; RD_COST this_rdc; + BLOCK_SIZE subsize_ref = + (cpi->sf.adapt_partition_source_sad) ? BLOCK_8X8 : BLOCK_16X16; vp9_rd_cost_reset(&this_rdc); if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -3448,7 +4106,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost, 0, INT64_MAX, pc_tree); } else if (bsize == BLOCK_32X32 && partition != PARTITION_NONE && - subsize >= BLOCK_16X16) { + subsize >= subsize_ref) { x->max_partition_size = BLOCK_32X32; x->min_partition_size = BLOCK_8X8; nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost, @@ -3673,13 +4331,18 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, const int mi_col_start = tile_info->mi_col_start; const int mi_col_end = tile_info->mi_col_end; int mi_col; + const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + const int num_sb_cols = + get_num_cols(tile_data->tile_info, MI_BLOCK_SIZE_LOG2); + int sb_col_in_tile; // Initialize the left context for the new SB row memset(&xd->left_context, 0, sizeof(xd->left_context)); memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context)); // Code each SB in the row - for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) { + for (mi_col = mi_col_start, sb_col_in_tile = 0; mi_col < mi_col_end; + mi_col += MI_BLOCK_SIZE, ++sb_col_in_tile) { const struct segmentation *const seg = &cm->seg; RD_COST dummy_rdc; const int idx_str = cm->mi_stride * mi_row + mi_col; @@ -3687,12 +4350,29 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, PARTITION_SEARCH_TYPE partition_search_type = sf->partition_search_type; BLOCK_SIZE bsize = BLOCK_64X64; int seg_skip = 0; + + (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile); + + if (cpi->use_skin_detection) { + vp9_compute_skin_sb(cpi, BLOCK_16X16, mi_row, mi_col); + } + x->source_variance = UINT_MAX; vp9_zero(x->pred_mv); vp9_rd_cost_init(&dummy_rdc); x->color_sensitivity[0] = 0; x->color_sensitivity[1] = 0; x->sb_is_skin = 0; + x->skip_low_source_sad = 0; + x->lowvar_highsumdiff = 0; + x->content_state_sb = 0; + x->sb_use_mv_part = 0; + x->sb_mvcol_part = 0; + x->sb_mvrow_part = 0; + x->sb_pickmode_part = 0; + x->arf_frame_usage = 0; + x->lastgolden_frame_usage = 0; if (seg->enabled) { const uint8_t *const map = @@ -3704,6 +4384,17 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, } } + if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) { + int shift = cpi->Source->y_stride * (mi_row << 3) + (mi_col << 3); + int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + int64_t source_sad = avg_source_sad(cpi, x, 
shift, sb_offset2); + if (sf->adapt_partition_source_sad && + (cpi->oxcf.rc_mode == VPX_VBR && !cpi->rc.is_src_frame_alt_ref && + source_sad > sf->adapt_partition_thresh && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) + partition_search_type = REFERENCE_PARTITION; + } + // Set the partition type of the 64X64 block switch (partition_search_type) { case VAR_BASED_PARTITION: @@ -3727,15 +4418,14 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); break; case REFERENCE_PARTITION: + x->sb_pickmode_part = 1; set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); - // Use nonrd_pick_partition on scene-cut for VBR, or on qp-segment - // if cyclic_refresh is enabled. + // Use nonrd_pick_partition on scene-cut for VBR mode. // nonrd_pick_partition does not support 4x4 partition, so avoid it // on key frame for now. if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad && - cm->frame_type != KEY_FRAME) || - (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && - xd->mi[0]->segment_id)) { + cpi->oxcf.speed < 6 && cm->frame_type != KEY_FRAME && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { // Use lower max_partition_size for low resoultions. if (cm->width <= 352 && cm->height <= 288) x->max_partition_size = BLOCK_32X32; @@ -3761,10 +4451,35 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, break; default: assert(0); break; } + + // Update ref_frame usage for inter frame if this group is ARF group. + if (!cpi->rc.is_src_frame_alt_ref && !cpi->refresh_golden_frame && + !cpi->refresh_alt_ref_frame && cpi->rc.alt_ref_gf_group && + cpi->sf.use_altref_onepass) { + int sboffset = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + if (cpi->count_arf_frame_usage != NULL) + cpi->count_arf_frame_usage[sboffset] = x->arf_frame_usage; + if (cpi->count_lastgolden_frame_usage != NULL) + cpi->count_lastgolden_frame_usage[sboffset] = x->lastgolden_frame_usage; + } + + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile, num_sb_cols); } } // end RTC play code +static INLINE uint32_t variance(const diff *const d) { + return d->sse - (uint32_t)(((int64_t)d->sum * d->sum) >> 8); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE uint32_t variance_highbd(diff *const d) { + const int64_t var = (int64_t)d->sse - (((int64_t)d->sum * d->sum) >> 8); + return (var >= 0) ? 
(uint32_t)var : 0; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + static int set_var_thresh_from_histogram(VP9_COMP *cpi) { const SPEED_FEATURES *const sf = &cpi->sf; const VP9_COMMON *const cm = &cpi->common; @@ -3794,14 +4509,17 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { case VPX_BITS_8: vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance(var16); break; case VPX_BITS_10: vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance_highbd(var16); break; case VPX_BITS_12: vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance_highbd(var16); break; default: assert(0 && @@ -3812,12 +4530,13 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { } else { vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance(var16); } #else vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance(var16); #endif // CONFIG_VP9_HIGHBITDEPTH - var16->var = var16->sse - (((uint32_t)var16->sum * var16->sum) >> 8); if (var16->var >= VAR_HIST_MAX_BG_VAR) hist[VAR_HIST_BINS - 1]++; @@ -3895,7 +4614,9 @@ void vp9_init_tile_data(VP9_COMP *cpi) { const int tile_rows = 1 << cm->log2_tile_rows; int tile_col, tile_row; TOKENEXTRA *pre_tok = cpi->tile_tok[0][0]; + TOKENLIST *tplist = cpi->tplist[0][0]; int tile_tok = 0; + int tplist_count = 0; if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { if (cpi->tile_data != NULL) vpx_free(cpi->tile_data); @@ -3910,51 +4631,75 @@ void vp9_init_tile_data(VP9_COMP *cpi) { int i, j; for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) { - tile_data->thresh_freq_fact[i][j] = 32; + tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; tile_data->mode_map[i][j] = j; } } +#if CONFIG_MULTITHREAD + tile_data->row_base_thresh_freq_fact = NULL; +#endif } } for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileInfo *tile_info = - &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + TileInfo *tile_info = &this_tile->tile_info; vp9_tile_init(tile_info, cm, tile_row, tile_col); cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; pre_tok = cpi->tile_tok[tile_row][tile_col]; tile_tok = allocated_tokens(*tile_info); + + cpi->tplist[tile_row][tile_col] = tplist + tplist_count; + tplist = cpi->tplist[tile_row][tile_col]; + tplist_count = get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2); } } } +void vp9_encode_sb_row(VP9_COMP *cpi, ThreadData *td, int tile_row, + int tile_col, int mi_row) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + TOKENEXTRA *tok = NULL; + int tile_sb_row; + int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 1) >> 1; + + tile_sb_row = mi_cols_aligned_to_sb(mi_row - tile_info->mi_row_start) >> + MI_BLOCK_SIZE_LOG2; + get_start_tok(cpi, tile_row, tile_col, mi_row, &tok); + cpi->tplist[tile_row][tile_col][tile_sb_row].start = tok; + + if (cpi->sf.use_nonrd_pick_mode) + encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); + else + encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); + + 
cpi->tplist[tile_row][tile_col][tile_sb_row].stop = tok; + cpi->tplist[tile_row][tile_col][tile_sb_row].count = + (unsigned int)(cpi->tplist[tile_row][tile_col][tile_sb_row].stop - + cpi->tplist[tile_row][tile_col][tile_sb_row].start); + assert(tok - cpi->tplist[tile_row][tile_col][tile_sb_row].start <= + get_token_alloc(MI_BLOCK_SIZE >> 1, tile_mb_cols)); + + (void)tile_mb_cols; +} + void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td, int tile_row, int tile_col) { VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; const TileInfo *const tile_info = &this_tile->tile_info; - TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col]; const int mi_row_start = tile_info->mi_row_start; const int mi_row_end = tile_info->mi_row_end; int mi_row; - // Set up pointers to per thread motion search counters. - td->mb.m_search_count_ptr = &td->rd_counts.m_search_count; - td->mb.ex_search_count_ptr = &td->rd_counts.ex_search_count; - - for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE) { - if (cpi->sf.use_nonrd_pick_mode) - encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); - else - encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); - } - cpi->tok_count[tile_row][tile_col] = - (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]); - assert(tok - cpi->tile_tok[tile_row][tile_col] <= - allocated_tokens(*tile_info)); + for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE) + vp9_encode_sb_row(cpi, td, tile_row, tile_col, mi_row); } static void encode_tiles(VP9_COMP *cpi) { @@ -4002,15 +4747,15 @@ static void encode_frame_internal(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) - x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4; + x->fwd_txfm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4; else - x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; - x->highbd_itxm_add = + x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; + x->highbd_inv_txfm_add = xd->lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add; #else - x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; + x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH - x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + x->inv_txfm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; if (xd->lossless) x->optimize = 0; @@ -4048,6 +4793,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_zero(x->zcoeff_blk); if (cm->frame_type != KEY_FRAME && cpi->rc.frames_since_golden == 0 && + !(cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) && !cpi->use_svc) cpi->ref_frame_flags &= (~VP9_GOLD_FLAG); @@ -4066,11 +4812,20 @@ static void encode_frame_internal(VP9_COMP *cpi) { } #endif - // If allowed, encoding tiles in parallel with one thread handling one tile. - if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1) - vp9_encode_tiles_mt(cpi); - else - encode_tiles(cpi); + if (!cpi->row_mt) { + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy; + // If allowed, encoding tiles in parallel with one thread handling one + // tile when row based multi-threading is disabled. 
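    // The dummy sync hooks assigned above are no-ops, so the
    // per-superblock-row synchronization calls made by the row encoders
    // cost nothing on this path.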
+ if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1) + vp9_encode_tiles_mt(cpi); + else + encode_tiles(cpi); + } else { + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write; + vp9_encode_tiles_row_mt(cpi); + } vpx_usec_timer_mark(&emr_timer); cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer); @@ -4243,8 +4998,31 @@ void vp9_encode_frame(VP9_COMP *cpi) { } } } else { + FRAME_COUNTS *counts = cpi->td.counts; cm->reference_mode = SINGLE_REFERENCE; + if (cpi->allow_comp_inter_inter && cpi->sf.use_compound_nonrd_pickmode && + cpi->rc.alt_ref_gf_group && !cpi->rc.is_src_frame_alt_ref && + cm->frame_type != KEY_FRAME) + cm->reference_mode = REFERENCE_MODE_SELECT; + encode_frame_internal(cpi); + + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + int single_count_zero = 0; + int comp_count_zero = 0; + int i; + for (i = 0; i < COMP_INTER_CONTEXTS; i++) { + single_count_zero += counts->comp_inter[i][0]; + comp_count_zero += counts->comp_inter[i][1]; + } + if (comp_count_zero == 0) { + cm->reference_mode = SINGLE_REFERENCE; + vp9_zero(counts->comp_inter); + } else if (single_count_zero == 0) { + cm->reference_mode = COMPOUND_REFERENCE; + vp9_zero(counts->comp_inter); + } + } } // If segmented AQ is enabled compute the average AQ weighting. diff --git a/libs/libvpx/vp9/encoder/vp9_encodeframe.h b/libs/libvpx/vp9/encoder/vp9_encodeframe.h index aa54947858..cf5ae3d8ac 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodeframe.h +++ b/libs/libvpx/vp9/encoder/vp9_encodeframe.h @@ -39,7 +39,11 @@ void vp9_init_tile_data(struct VP9_COMP *cpi); void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td, int tile_row, int tile_col); -void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q); +void vp9_encode_sb_row(struct VP9_COMP *cpi, struct ThreadData *td, + int tile_row, int tile_col, int mi_row); + +void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q, + int content_state); #ifdef __cplusplus } // extern "C" diff --git a/libs/libvpx/vp9/encoder/vp9_encodemb.c b/libs/libvpx/vp9/encoder/vp9_encodemb.c index 20ebe68197..f3c17f2559 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodemb.c +++ b/libs/libvpx/vp9/encoder/vp9_encodemb.c @@ -49,36 +49,13 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { pd->dst.buf, pd->dst.stride); } -typedef struct vp9_token_state { - int64_t error; - int rate; - int16_t next; - int16_t token; - tran_low_t qc; - tran_low_t dqc; - uint8_t best_index; -} vp9_token_state; - static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { { 10, 6 }, { 8, 5 }, }; -#define UPDATE_RD_COST() \ - { \ - rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); \ - rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); \ - } - -// This function is a place holder for now but may ultimately need -// to scan previous tokens to work out the correct context. -static int trellis_get_coeff_context(const int16_t *scan, const int16_t *nb, - int idx, int token, uint8_t *token_cache) { - int bak = token_cache[scan[idx]], pt; - token_cache[scan[idx]] = vp9_pt_energy_class[token]; - pt = get_coef_context(nb, token_cache, idx + 1); - token_cache[scan[idx]] = bak; - return pt; -} +// 'num' can be negative, but 'shift' must be non-negative. +#define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \ + ((num) >= 0) ? 
(num) >> (shift) : -((-(num)) >> (shift)) int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, int ctx) { @@ -86,232 +63,244 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, struct macroblock_plane *const p = &mb->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; const int ref = is_inter_block(xd->mi[0]); - vp9_token_state tokens[1025][2]; uint8_t token_cache[1024]; - const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block); + const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const int eob = p->eobs[block]; - const PLANE_TYPE type = get_plane_type(plane); + const PLANE_TYPE plane_type = get_plane_type(plane); const int default_eob = 16 << (tx_size << 1); const int shift = (tx_size == TX_32X32); const int16_t *const dequant_ptr = pd->dequant; const uint8_t *const band_translate = get_band_translate(tx_size); - const scan_order *const so = get_scan(xd, tx_size, type, block); + const scan_order *const so = get_scan(xd, tx_size, plane_type, block); const int16_t *const scan = so->scan; const int16_t *const nb = so->neighbors; - const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift }; - int next = eob, sz = 0; - const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][type]) >> 1; + const int64_t rdmult = + ((int64_t)mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1; const int64_t rddiv = mb->rddiv; int64_t rd_cost0, rd_cost1; - int rate0, rate1; - int64_t error0, error1; + int64_t rate0, rate1; int16_t t0, t1; - EXTRABIT e0; - int best, band, pt, i, final_eob; + int i, final_eob; #if CONFIG_VP9_HIGHBITDEPTH - const int *cat6_high_cost = vp9_get_high_cost_table(xd->bd); + const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd); #else - const int *cat6_high_cost = vp9_get_high_cost_table(8); + const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8); #endif + unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + mb->token_costs[tx_size][plane_type][ref]; + unsigned int(*token_costs_cur)[2][COEFF_CONTEXTS][ENTROPY_TOKENS]; + int64_t eob_cost0, eob_cost1; + const int ctx0 = ctx; + int64_t accu_rate = 0; + // Initialized to the worst possible error for the largest transform size. + // This ensures that it never goes negative. + int64_t accu_error = ((int64_t)1) << 50; + int64_t best_block_rd_cost = INT64_MAX; + int x_prev = 1; + tran_low_t before_best_eob_qc = 0; + tran_low_t before_best_eob_dqc = 0; - assert((!type && !plane) || (type && plane)); + assert((!plane_type && !plane) || (plane_type && plane)); assert(eob <= default_eob); - /* Now set up a Viterbi trellis to evaluate alternative roundings. */ - /* Initialize the sentinel node of the trellis. */ - tokens[eob][0].rate = 0; - tokens[eob][0].error = 0; - tokens[eob][0].next = default_eob; - tokens[eob][0].token = EOB_TOKEN; - tokens[eob][0].qc = 0; - tokens[eob][1] = tokens[eob][0]; - - for (i = 0; i < eob; i++) - token_cache[scan[i]] = vp9_pt_energy_class[vp9_get_token(qcoeff[scan[i]])]; - - for (i = eob; i-- > 0;) { - int base_bits, d2, dx; + for (i = 0; i < eob; i++) { const int rc = scan[i]; - int x = qcoeff[rc]; - /* Only add a trellis state for non-zero coefficients. */ - if (x) { - int shortcut = 0; - error0 = tokens[next][0].error; - error1 = tokens[next][1].error; - /* Evaluate the first possibility for this state. 
*/ - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; - vp9_get_token_extra(x, &t0, &e0); - /* Consider both possible successor states. */ - if (next < default_eob) { - band = band_translate[i + 1]; - pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += mb->token_costs[tx_size][type][ref][band][0][pt] - [tokens[next][0].token]; - rate1 += mb->token_costs[tx_size][type][ref][band][0][pt] - [tokens[next][1].token]; - } - UPDATE_RD_COST(); - /* And pick the best. */ - best = rd_cost1 < rd_cost0; - base_bits = vp9_get_cost(t0, e0, cat6_high_cost); - dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift); -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dx >>= xd->bd - 8; - } -#endif // CONFIG_VP9_HIGHBITDEPTH - d2 = dx * dx; - tokens[i][0].rate = base_bits + (best ? rate1 : rate0); - tokens[i][0].error = d2 + (best ? error1 : error0); - tokens[i][0].next = next; - tokens[i][0].token = t0; - tokens[i][0].qc = x; - tokens[i][0].dqc = dqcoeff[rc]; - tokens[i][0].best_index = best; + token_cache[rc] = vp9_pt_energy_class[vp9_get_token(qcoeff[rc])]; + } + final_eob = 0; - /* Evaluate the second possibility for this state. */ - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; + // Initial RD cost. + token_costs_cur = token_costs + band_translate[0]; + rate0 = (*token_costs_cur)[0][ctx0][EOB_TOKEN]; + best_block_rd_cost = RDCOST(rdmult, rddiv, rate0, accu_error); - if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) && - (abs(x) * dequant_ptr[rc != 0] < - (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])) - shortcut = 1; - else - shortcut = 0; - - if (shortcut) { - sz = -(x < 0); - x -= 2 * sz + 1; - } else { - tokens[i][1] = tokens[i][0]; - next = i; - continue; - } - - /* Consider both possible successor states. */ - if (!x) { - /* If we reduced this coefficient to zero, check to see if - * we need to move the EOB back here. - */ - t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; - t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; - e0 = 0; - } else { - vp9_get_token_extra(x, &t0, &e0); - t1 = t0; - } - if (next < default_eob) { - band = band_translate[i + 1]; - if (t0 != EOB_TOKEN) { - pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt] - [tokens[next][0].token]; - } - if (t1 != EOB_TOKEN) { - pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache); - rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt] - [tokens[next][1].token]; - } - } - - UPDATE_RD_COST(); - /* And pick the best. */ - best = rd_cost1 < rd_cost0; - base_bits = vp9_get_cost(t0, e0, cat6_high_cost); - - if (shortcut) { -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz; - } else { - dx -= (dequant_ptr[rc != 0] + sz) ^ sz; - } -#else - dx -= (dequant_ptr[rc != 0] + sz) ^ sz; -#endif // CONFIG_VP9_HIGHBITDEPTH - d2 = dx * dx; - } - tokens[i][1].rate = base_bits + (best ? rate1 : rate0); - tokens[i][1].error = d2 + (best ? error1 : error0); - tokens[i][1].next = next; - tokens[i][1].token = best ? t1 : t0; - tokens[i][1].qc = x; - - if (x) { - tran_low_t offset = dq_step[rc != 0]; - // The 32x32 transform coefficient uses half quantization step size. - // Account for the rounding difference in the dequantized coefficeint - // value when the quantization index is dropped from an even number - // to an odd number. 
- if (shift & x) offset += (dequant_ptr[rc != 0] & 0x01); - - if (sz == 0) - tokens[i][1].dqc = dqcoeff[rc] - offset; - else - tokens[i][1].dqc = dqcoeff[rc] + offset; - } else { - tokens[i][1].dqc = 0; - } - - tokens[i][1].best_index = best; - /* Finally, make this the new head of the trellis. */ - next = i; + // For each token, pick one of two choices greedily: + // (i) First candidate: Keep current quantized value, OR + // (ii) Second candidate: Reduce quantized value by 1. + for (i = 0; i < eob; i++) { + const int rc = scan[i]; + const int x = qcoeff[rc]; + const int band_cur = band_translate[i]; + const int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i); + const int token_tree_sel_cur = (x_prev == 0); + token_costs_cur = token_costs + band_cur; + if (x == 0) { // No need to search + const int token = vp9_get_token(x); + rate0 = (*token_costs_cur)[token_tree_sel_cur][ctx_cur][token]; + accu_rate += rate0; + x_prev = 0; + // Note: accu_error does not change. } else { - /* There's no choice to make for a zero coefficient, so we don't - * add a new trellis node, but we do need to update the costs. - */ - band = band_translate[i + 1]; - pt = get_coef_context(nb, token_cache, i + 1); - t0 = tokens[next][0].token; - t1 = tokens[next][1].token; - /* Update the cost of each path if we're past the EOB token. */ - if (t0 != EOB_TOKEN) { - tokens[next][0].rate += - mb->token_costs[tx_size][type][ref][band][1][pt][t0]; - tokens[next][0].token = ZERO_TOKEN; + const int dqv = dequant_ptr[rc != 0]; + // Compute the distortion for quantizing to 0. + const int diff_for_zero_raw = (0 - coeff[rc]) * (1 << shift); + const int diff_for_zero = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff_for_zero_raw, xd->bd - 8) + : +#endif + diff_for_zero_raw; + const int64_t distortion_for_zero = + (int64_t)diff_for_zero * diff_for_zero; + + // Compute the distortion for the first candidate + const int diff0_raw = (dqcoeff[rc] - coeff[rc]) * (1 << shift); + const int diff0 = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff0_raw, xd->bd - 8) + : +#endif // CONFIG_VP9_HIGHBITDEPTH + diff0_raw; + const int64_t distortion0 = (int64_t)diff0 * diff0; + + // Compute the distortion for the second candidate + const int sign = -(x < 0); // -1 if x is negative and 0 otherwise. + const int x1 = x - 2 * sign - 1; // abs(x1) = abs(x) - 1. + int64_t distortion1; + if (x1 != 0) { + const int dqv_step = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? dqv >> (xd->bd - 8) + : +#endif // CONFIG_VP9_HIGHBITDEPTH + dqv; + const int diff_step = (dqv_step + sign) ^ sign; + const int diff1 = diff0 - diff_step; + assert(dqv > 0); // We aren't right shifting a negative number above. + distortion1 = (int64_t)diff1 * diff1; + } else { + distortion1 = distortion_for_zero; } - if (t1 != EOB_TOKEN) { - tokens[next][1].rate += - mb->token_costs[tx_size][type][ref][band][1][pt][t1]; - tokens[next][1].token = ZERO_TOKEN; + { + // Calculate RDCost for current coeff for the two candidates. 
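        // Candidate 0 keeps the current quantized value x; candidate 1 uses
        // x1, whose magnitude is one step smaller (possibly zero).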
+ const int64_t base_bits0 = vp9_get_token_cost(x, &t0, cat6_high_cost); + const int64_t base_bits1 = vp9_get_token_cost(x1, &t1, cat6_high_cost); + rate0 = + base_bits0 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t0]; + rate1 = + base_bits1 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t1]; + } + { + int rdcost_better_for_x1, eob_rdcost_better_for_x1; + int dqc0, dqc1; + int64_t best_eob_cost_cur; + int use_x1; + + // Calculate RD Cost effect on the next coeff for the two candidates. + int64_t next_bits0 = 0; + int64_t next_bits1 = 0; + int64_t next_eob_bits0 = 0; + int64_t next_eob_bits1 = 0; + if (i < default_eob - 1) { + int ctx_next, token_tree_sel_next; + const int band_next = band_translate[i + 1]; + const int token_next = + (i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN; + unsigned int( + *const token_costs_next)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + token_costs + band_next; + token_cache[rc] = vp9_pt_energy_class[t0]; + ctx_next = get_coef_context(nb, token_cache, i + 1); + token_tree_sel_next = (x == 0); + next_bits0 = + (*token_costs_next)[token_tree_sel_next][ctx_next][token_next]; + next_eob_bits0 = + (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN]; + token_cache[rc] = vp9_pt_energy_class[t1]; + ctx_next = get_coef_context(nb, token_cache, i + 1); + token_tree_sel_next = (x1 == 0); + next_bits1 = + (*token_costs_next)[token_tree_sel_next][ctx_next][token_next]; + if (x1 != 0) { + next_eob_bits1 = + (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN]; + } + } + + // Compare the total RD costs for two candidates. + rd_cost0 = RDCOST(rdmult, rddiv, (rate0 + next_bits0), distortion0); + rd_cost1 = RDCOST(rdmult, rddiv, (rate1 + next_bits1), distortion1); + rdcost_better_for_x1 = (rd_cost1 < rd_cost0); + eob_cost0 = RDCOST(rdmult, rddiv, (accu_rate + rate0 + next_eob_bits0), + (accu_error + distortion0 - distortion_for_zero)); + eob_cost1 = eob_cost0; + if (x1 != 0) { + eob_cost1 = + RDCOST(rdmult, rddiv, (accu_rate + rate1 + next_eob_bits1), + (accu_error + distortion1 - distortion_for_zero)); + eob_rdcost_better_for_x1 = (eob_cost1 < eob_cost0); + } else { + eob_rdcost_better_for_x1 = 0; + } + + // Calculate the two candidate de-quantized values. + dqc0 = dqcoeff[rc]; + dqc1 = 0; + if (rdcost_better_for_x1 + eob_rdcost_better_for_x1) { + if (x1 != 0) { + dqc1 = RIGHT_SHIFT_POSSIBLY_NEGATIVE(x1 * dqv, shift); + } else { + dqc1 = 0; + } + } + + // Pick and record the better quantized and de-quantized values. + if (rdcost_better_for_x1) { + qcoeff[rc] = x1; + dqcoeff[rc] = dqc1; + accu_rate += rate1; + accu_error += distortion1 - distortion_for_zero; + assert(distortion1 <= distortion_for_zero); + token_cache[rc] = vp9_pt_energy_class[t1]; + } else { + accu_rate += rate0; + accu_error += distortion0 - distortion_for_zero; + assert(distortion0 <= distortion_for_zero); + token_cache[rc] = vp9_pt_energy_class[t0]; + } + assert(accu_error >= 0); + x_prev = qcoeff[rc]; // Update based on selected quantized value. + + use_x1 = (x1 != 0) && eob_rdcost_better_for_x1; + best_eob_cost_cur = use_x1 ? 
eob_cost1 : eob_cost0; + + // Determine whether to move the eob position to i+1 + if (best_eob_cost_cur < best_block_rd_cost) { + best_block_rd_cost = best_eob_cost_cur; + final_eob = i + 1; + if (use_x1) { + before_best_eob_qc = x1; + before_best_eob_dqc = dqc1; + } else { + before_best_eob_qc = x; + before_best_eob_dqc = dqc0; + } + } } - tokens[i][0].best_index = tokens[i][1].best_index = 0; - /* Don't update next, because we didn't add a new node. */ } } - - /* Now pick the best path through the whole trellis. */ - band = band_translate[i + 1]; - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; - error0 = tokens[next][0].error; - error1 = tokens[next][1].error; - t0 = tokens[next][0].token; - t1 = tokens[next][1].token; - rate0 += mb->token_costs[tx_size][type][ref][band][0][ctx][t0]; - rate1 += mb->token_costs[tx_size][type][ref][band][0][ctx][t1]; - UPDATE_RD_COST(); - best = rd_cost1 < rd_cost0; - final_eob = -1; - - for (i = next; i < eob; i = next) { - const int x = tokens[i][best].qc; - const int rc = scan[i]; - if (x) final_eob = i; - qcoeff[rc] = x; - dqcoeff[rc] = tokens[i][best].dqc; - next = tokens[i][best].next; - best = tokens[i][best].best_index; + assert(final_eob <= eob); + if (final_eob > 0) { + int rc; + assert(before_best_eob_qc != 0); + i = final_eob - 1; + rc = scan[i]; + qcoeff[rc] = before_best_eob_qc; + dqcoeff[rc] = before_best_eob_dqc; + } + for (i = final_eob; i < eob; i++) { + int rc = scan[i]; + qcoeff[rc] = 0; + dqcoeff[rc] = 0; } - final_eob++; - mb->plane[plane].eobs[block] = final_eob; return final_eob; } +#undef RIGHT_SHIFT_POSSIBLY_NEGATIVE static INLINE void fdct32x32(int rd_transform, const int16_t *src, tran_low_t *dst, int src_stride) { @@ -344,37 +333,35 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *src_diff; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + // skip block condition should be handled before this is called. 
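  // The assert below enforces that invariant: every quantize call in this
  // function is reached with x->skip_block == 0.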
+ assert(!x->skip_block); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, - p->round_fp, p->quant_fp, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->round_fp, + p->quant_fp, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->round_fp, + p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->round_fp, + p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->round_fp, + p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; default: assert(0); } @@ -385,28 +372,26 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->round_fp, + p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, + scan_order->scan, scan_order->iscan); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vp9_quantize_fp(coeff, 256, x->skip_block, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; case TX_8X8: vp9_fdct8x8_quant(src_diff, diff_stride, coeff, 64, x->skip_block, - p->zbin, p->round_fp, p->quant_fp, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, + eob, scan_order->scan, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); break; default: assert(0); 
break; } @@ -424,6 +409,9 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *src_diff; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + // skip block condition should be handled before this is called. + assert(!x->skip_block); + #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { switch (tx_size) { @@ -446,7 +434,7 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, eob); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); @@ -474,7 +462,7 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, qcoeff, dqcoeff, pd->dequant[0], eob); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; @@ -495,6 +483,8 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *src_diff; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + // skip block condition should be handled before this is called. + assert(!x->skip_block); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -521,7 +511,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, @@ -554,7 +544,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, scan_order->scan, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); @@ -629,25 +619,26 @@ static void encode_block(int plane, int block, int row, int col, if (x->skip_encode || p->eobs[block] == 0) return; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); switch (tx_size) { case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + vp9_highbd_idct32x32_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + vp9_highbd_idct16x16_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; case TX_4X4: // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. 
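        // dst16 (declared above via CONVERT_TO_SHORTPTR) is the 16-bit view
        // of the same destination buffer that the high bit depth inverse
        // transforms expect.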
- x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], - xd->bd); + x->highbd_inv_txfm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], + xd->bd); break; default: assert(0 && "Invalid transform size"); } @@ -669,7 +660,7 @@ static void encode_block(int plane, int block, int row, int col, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; default: assert(0 && "Invalid transform size"); break; } @@ -691,11 +682,12 @@ static void encode_block_pass1(int plane, int block, int row, int col, if (p->eobs[block] > 0) { #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd); + x->highbd_inv_txfm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride, + p->eobs[block], xd->bd); return; } #endif // CONFIG_VP9_HIGHBITDEPTH - x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); } } @@ -784,12 +776,17 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, } } - vp9_predict_intra_block(xd, bwl, tx_size, mode, x->skip_encode ? src : dst, - x->skip_encode ? src_stride : dst_stride, dst, - dst_stride, col, row, plane); + vp9_predict_intra_block( + xd, bwl, tx_size, mode, (x->skip_encode || x->fp_src_pred) ? src : dst, + (x->skip_encode || x->fp_src_pred) ? src_stride : dst_stride, dst, + dst_stride, col, row, plane); + + // skip block condition should be handled before this is called. + assert(!x->skip_block); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); switch (tx_size) { case TX_32X32: if (!x->skip_recode) { @@ -801,8 +798,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } + if (args->enable_coeff_opt && !x->skip_recode) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } if (!x->skip_encode && *eob) { - vp9_highbd_idct32x32_add(dqcoeff, dst, dst_stride, *eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, dst16, dst_stride, *eob, xd->bd); } break; case TX_16X16: @@ -818,8 +818,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, pd->dequant, eob, scan_order->scan, scan_order->iscan); } + if (args->enable_coeff_opt && !x->skip_recode) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } if (!x->skip_encode && *eob) { - vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob, + vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, dst_stride, *eob, xd->bd); } break; @@ -836,8 +839,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, pd->dequant, eob, scan_order->scan, scan_order->iscan); } + if (args->enable_coeff_opt && !x->skip_recode) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } if (!x->skip_encode && *eob) { - vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob, + vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, dst_stride, *eob, xd->bd); } break; @@ -848,21 +854,24 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, if (tx_type != DCT_DCT) vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); else - x->fwd_txm4x4(src_diff, coeff, diff_stride); + 
x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); } - + if (args->enable_coeff_opt && !x->skip_recode) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; + } if (!x->skip_encode && *eob) { if (tx_type == DCT_DCT) { // this is like vp9_short_idct4x4 but has a special case around // eob<=1 which is significant (not just an optimization) for the // lossless case. - x->highbd_itxm_add(dqcoeff, dst, dst_stride, *eob, xd->bd); + x->highbd_inv_txfm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd); } else { - vp9_highbd_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type, xd->bd); + vp9_highbd_iht4x4_16_add(dqcoeff, dst16, dst_stride, tx_type, + xd->bd); } } break; @@ -927,7 +936,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, if (tx_type != DCT_DCT) vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); else - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); @@ -940,7 +949,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - x->itxm_add(dqcoeff, dst, dst_stride, *eob); + x->inv_txfm_add(dqcoeff, dst, dst_stride, *eob); else vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type); } diff --git a/libs/libvpx/vp9/encoder/vp9_encodemv.c b/libs/libvpx/vp9/encoder/vp9_encodemv.c index 874a8e4b98..023d087c2c 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodemv.c +++ b/libs/libvpx/vp9/encoder/vp9_encodemv.c @@ -208,7 +208,8 @@ void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w, } void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref, - const nmv_context *mvctx, int usehp) { + const nmv_context *mvctx, int usehp, + unsigned int *const max_mv_magnitude) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff); usehp = usehp && use_mv_hp(ref); @@ -223,8 +224,8 @@ void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref, // If auto_mv_step_size is enabled then keep track of the largest // motion vector component used. 
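  // The running maximum is accumulated through the caller-supplied
  // max_mv_magnitude pointer rather than written directly into
  // cpi->max_mv_magnitude (presumably so multi-threaded callers can keep
  // per-thread maxima).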
if (cpi->sf.mv.auto_mv_step_size) { - unsigned int maxv = VPXMAX(abs(mv->row), abs(mv->col)) >> 3; - cpi->max_mv_magnitude = VPXMAX(maxv, cpi->max_mv_magnitude); + const unsigned int maxv = VPXMAX(abs(mv->row), abs(mv->col)) >> 3; + *max_mv_magnitude = VPXMAX(maxv, *max_mv_magnitude); } } diff --git a/libs/libvpx/vp9/encoder/vp9_encodemv.h b/libs/libvpx/vp9/encoder/vp9_encodemv.h index ad77b8154f..9fc7ab8dc4 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodemv.h +++ b/libs/libvpx/vp9/encoder/vp9_encodemv.h @@ -23,7 +23,8 @@ void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w, nmv_context_counts *const counts); void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref, - const nmv_context *mvctx, int usehp); + const nmv_context *mvctx, int usehp, + unsigned int *const max_mv_magnitude); void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2], const nmv_context *mvctx, int usehp); diff --git a/libs/libvpx/vp9/encoder/vp9_encoder.c b/libs/libvpx/vp9/encoder/vp9_encoder.c index e8f5ec55ab..2ae59dd981 100644 --- a/libs/libvpx/vp9/encoder/vp9_encoder.c +++ b/libs/libvpx/vp9/encoder/vp9_encoder.c @@ -50,6 +50,7 @@ #include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mbgraph.h" +#include "vp9/encoder/vp9_multi_thread.h" #include "vp9/encoder/vp9_noise_estimate.h" #include "vp9/encoder/vp9_picklpf.h" #include "vp9/encoder/vp9_ratectrl.h" @@ -70,13 +71,15 @@ // mv. Choose a very high value for // now so that HIGH_PRECISION is always // chosen. -// #define OUTPUT_YUV_REC + +#define FRAME_SIZE_FACTOR 128 // empirical params for context model threshold +#define FRAME_RATE_FACTOR 8 #ifdef OUTPUT_YUV_DENOISED FILE *yuv_denoised_file = NULL; #endif #ifdef OUTPUT_YUV_SKINMAP -FILE *yuv_skinmap_file = NULL; +static FILE *yuv_skinmap_file = NULL; #endif #ifdef OUTPUT_YUV_REC FILE *yuv_rec_file; @@ -99,6 +102,331 @@ static int is_spatial_denoise_enabled(VP9_COMP *cpi) { } #endif +// compute adaptive threshold for skip recoding +static int compute_context_model_thresh(const VP9_COMP *const cpi) { + const VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const int frame_size = (cm->width * cm->height) >> 10; + const int bitrate = (int)(oxcf->target_bandwidth >> 10); + const int qindex_factor = cm->base_qindex + (MAXQ >> 1); + + // This equation makes the threshold adaptive to frame size. + // Coding gain obtained by recoding comes from alternate frames of large + // content change. We skip recoding if the difference of previous and current + // frame context probability model is less than a certain threshold. + // The first component is the most critical part to guarantee adaptivity. + // Other parameters are estimated based on normal setting of hd resolution + // parameters. e.g frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50 + const int thresh = + ((FRAME_SIZE_FACTOR * frame_size - FRAME_RATE_FACTOR * bitrate) * + qindex_factor) >> + 9; + + return thresh; +} + +// compute the total cost difference between current +// and previous frame context prob model. 
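// Each accumulated term below is count * (previous_prob - current_prob)
// for one syntax element (modes, partitions, coefficients, interp filters,
// reference signalling, tx sizes, skip flags and motion vectors); the sum
// is negated before being returned.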
+static int compute_context_model_diff(const VP9_COMMON *const cm) { + const FRAME_CONTEXT *const pre_fc = + &cm->frame_contexts[cm->frame_context_idx]; + const FRAME_CONTEXT *const cur_fc = cm->fc; + const FRAME_COUNTS *counts = &cm->counts; + vpx_prob pre_last_prob, cur_last_prob; + int diff = 0; + int i, j, k, l, m, n; + + // y_mode_prob + for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) { + for (j = 0; j < INTRA_MODES - 1; ++j) { + diff += (int)counts->y_mode[i][j] * + (pre_fc->y_mode_prob[i][j] - cur_fc->y_mode_prob[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->y_mode_prob[i][INTRA_MODES - 2]; + cur_last_prob = MAX_PROB - cur_fc->y_mode_prob[i][INTRA_MODES - 2]; + + diff += (int)counts->y_mode[i][INTRA_MODES - 1] * + (pre_last_prob - cur_last_prob); + } + + // uv_mode_prob + for (i = 0; i < INTRA_MODES; ++i) { + for (j = 0; j < INTRA_MODES - 1; ++j) { + diff += (int)counts->uv_mode[i][j] * + (pre_fc->uv_mode_prob[i][j] - cur_fc->uv_mode_prob[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->uv_mode_prob[i][INTRA_MODES - 2]; + cur_last_prob = MAX_PROB - cur_fc->uv_mode_prob[i][INTRA_MODES - 2]; + + diff += (int)counts->uv_mode[i][INTRA_MODES - 1] * + (pre_last_prob - cur_last_prob); + } + + // partition_prob + for (i = 0; i < PARTITION_CONTEXTS; ++i) { + for (j = 0; j < PARTITION_TYPES - 1; ++j) { + diff += (int)counts->partition[i][j] * + (pre_fc->partition_prob[i][j] - cur_fc->partition_prob[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->partition_prob[i][PARTITION_TYPES - 2]; + cur_last_prob = MAX_PROB - cur_fc->partition_prob[i][PARTITION_TYPES - 2]; + + diff += (int)counts->partition[i][PARTITION_TYPES - 1] * + (pre_last_prob - cur_last_prob); + } + + // coef_probs + for (i = 0; i < TX_SIZES; ++i) { + for (j = 0; j < PLANE_TYPES; ++j) { + for (k = 0; k < REF_TYPES; ++k) { + for (l = 0; l < COEF_BANDS; ++l) { + for (m = 0; m < BAND_COEFF_CONTEXTS(l); ++m) { + for (n = 0; n < UNCONSTRAINED_NODES; ++n) { + diff += (int)counts->coef[i][j][k][l][m][n] * + (pre_fc->coef_probs[i][j][k][l][m][n] - + cur_fc->coef_probs[i][j][k][l][m][n]); + } + + pre_last_prob = + MAX_PROB - + pre_fc->coef_probs[i][j][k][l][m][UNCONSTRAINED_NODES - 1]; + cur_last_prob = + MAX_PROB - + cur_fc->coef_probs[i][j][k][l][m][UNCONSTRAINED_NODES - 1]; + + diff += (int)counts->coef[i][j][k][l][m][UNCONSTRAINED_NODES] * + (pre_last_prob - cur_last_prob); + } + } + } + } + } + + // switchable_interp_prob + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) { + for (j = 0; j < SWITCHABLE_FILTERS - 1; ++j) { + diff += (int)counts->switchable_interp[i][j] * + (pre_fc->switchable_interp_prob[i][j] - + cur_fc->switchable_interp_prob[i][j]); + } + pre_last_prob = + MAX_PROB - pre_fc->switchable_interp_prob[i][SWITCHABLE_FILTERS - 2]; + cur_last_prob = + MAX_PROB - cur_fc->switchable_interp_prob[i][SWITCHABLE_FILTERS - 2]; + + diff += (int)counts->switchable_interp[i][SWITCHABLE_FILTERS - 1] * + (pre_last_prob - cur_last_prob); + } + + // inter_mode_probs + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { + for (j = 0; j < INTER_MODES - 1; ++j) { + diff += (int)counts->inter_mode[i][j] * + (pre_fc->inter_mode_probs[i][j] - cur_fc->inter_mode_probs[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->inter_mode_probs[i][INTER_MODES - 2]; + cur_last_prob = MAX_PROB - cur_fc->inter_mode_probs[i][INTER_MODES - 2]; + + diff += (int)counts->inter_mode[i][INTER_MODES - 1] * + (pre_last_prob - cur_last_prob); + } + + // intra_inter_prob + for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) { + diff += (int)counts->intra_inter[i][0] * + 
(pre_fc->intra_inter_prob[i] - cur_fc->intra_inter_prob[i]); + + pre_last_prob = MAX_PROB - pre_fc->intra_inter_prob[i]; + cur_last_prob = MAX_PROB - cur_fc->intra_inter_prob[i]; + + diff += (int)counts->intra_inter[i][1] * (pre_last_prob - cur_last_prob); + } + + // comp_inter_prob + for (i = 0; i < COMP_INTER_CONTEXTS; ++i) { + diff += (int)counts->comp_inter[i][0] * + (pre_fc->comp_inter_prob[i] - cur_fc->comp_inter_prob[i]); + + pre_last_prob = MAX_PROB - pre_fc->comp_inter_prob[i]; + cur_last_prob = MAX_PROB - cur_fc->comp_inter_prob[i]; + + diff += (int)counts->comp_inter[i][1] * (pre_last_prob - cur_last_prob); + } + + // single_ref_prob + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < 2; ++j) { + diff += (int)counts->single_ref[i][j][0] * + (pre_fc->single_ref_prob[i][j] - cur_fc->single_ref_prob[i][j]); + + pre_last_prob = MAX_PROB - pre_fc->single_ref_prob[i][j]; + cur_last_prob = MAX_PROB - cur_fc->single_ref_prob[i][j]; + + diff += + (int)counts->single_ref[i][j][1] * (pre_last_prob - cur_last_prob); + } + } + + // comp_ref_prob + for (i = 0; i < REF_CONTEXTS; ++i) { + diff += (int)counts->comp_ref[i][0] * + (pre_fc->comp_ref_prob[i] - cur_fc->comp_ref_prob[i]); + + pre_last_prob = MAX_PROB - pre_fc->comp_ref_prob[i]; + cur_last_prob = MAX_PROB - cur_fc->comp_ref_prob[i]; + + diff += (int)counts->comp_ref[i][1] * (pre_last_prob - cur_last_prob); + } + + // tx_probs + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { + // p32x32 + for (j = 0; j < TX_SIZES - 1; ++j) { + diff += (int)counts->tx.p32x32[i][j] * + (pre_fc->tx_probs.p32x32[i][j] - cur_fc->tx_probs.p32x32[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->tx_probs.p32x32[i][TX_SIZES - 2]; + cur_last_prob = MAX_PROB - cur_fc->tx_probs.p32x32[i][TX_SIZES - 2]; + + diff += (int)counts->tx.p32x32[i][TX_SIZES - 1] * + (pre_last_prob - cur_last_prob); + + // p16x16 + for (j = 0; j < TX_SIZES - 2; ++j) { + diff += (int)counts->tx.p16x16[i][j] * + (pre_fc->tx_probs.p16x16[i][j] - cur_fc->tx_probs.p16x16[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->tx_probs.p16x16[i][TX_SIZES - 3]; + cur_last_prob = MAX_PROB - cur_fc->tx_probs.p16x16[i][TX_SIZES - 3]; + + diff += (int)counts->tx.p16x16[i][TX_SIZES - 2] * + (pre_last_prob - cur_last_prob); + + // p8x8 + for (j = 0; j < TX_SIZES - 3; ++j) { + diff += (int)counts->tx.p8x8[i][j] * + (pre_fc->tx_probs.p8x8[i][j] - cur_fc->tx_probs.p8x8[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->tx_probs.p8x8[i][TX_SIZES - 4]; + cur_last_prob = MAX_PROB - cur_fc->tx_probs.p8x8[i][TX_SIZES - 4]; + + diff += + (int)counts->tx.p8x8[i][TX_SIZES - 3] * (pre_last_prob - cur_last_prob); + } + + // skip_probs + for (i = 0; i < SKIP_CONTEXTS; ++i) { + diff += (int)counts->skip[i][0] * + (pre_fc->skip_probs[i] - cur_fc->skip_probs[i]); + + pre_last_prob = MAX_PROB - pre_fc->skip_probs[i]; + cur_last_prob = MAX_PROB - cur_fc->skip_probs[i]; + + diff += (int)counts->skip[i][1] * (pre_last_prob - cur_last_prob); + } + + // mv + for (i = 0; i < MV_JOINTS - 1; ++i) { + diff += (int)counts->mv.joints[i] * + (pre_fc->nmvc.joints[i] - cur_fc->nmvc.joints[i]); + } + pre_last_prob = MAX_PROB - pre_fc->nmvc.joints[MV_JOINTS - 2]; + cur_last_prob = MAX_PROB - cur_fc->nmvc.joints[MV_JOINTS - 2]; + + diff += + (int)counts->mv.joints[MV_JOINTS - 1] * (pre_last_prob - cur_last_prob); + + for (i = 0; i < 2; ++i) { + const nmv_component_counts *nmv_count = &counts->mv.comps[i]; + const nmv_component *pre_nmv_prob = &pre_fc->nmvc.comps[i]; + const nmv_component *cur_nmv_prob = &cur_fc->nmvc.comps[i]; + + // sign + diff += 
(int)nmv_count->sign[0] * (pre_nmv_prob->sign - cur_nmv_prob->sign); + + pre_last_prob = MAX_PROB - pre_nmv_prob->sign; + cur_last_prob = MAX_PROB - cur_nmv_prob->sign; + + diff += (int)nmv_count->sign[1] * (pre_last_prob - cur_last_prob); + + // classes + for (j = 0; j < MV_CLASSES - 1; ++j) { + diff += (int)nmv_count->classes[j] * + (pre_nmv_prob->classes[j] - cur_nmv_prob->classes[j]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->classes[MV_CLASSES - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->classes[MV_CLASSES - 2]; + + diff += (int)nmv_count->classes[MV_CLASSES - 1] * + (pre_last_prob - cur_last_prob); + + // class0 + for (j = 0; j < CLASS0_SIZE - 1; ++j) { + diff += (int)nmv_count->class0[j] * + (pre_nmv_prob->class0[j] - cur_nmv_prob->class0[j]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->class0[CLASS0_SIZE - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->class0[CLASS0_SIZE - 2]; + + diff += (int)nmv_count->class0[CLASS0_SIZE - 1] * + (pre_last_prob - cur_last_prob); + + // bits + for (j = 0; j < MV_OFFSET_BITS; ++j) { + diff += (int)nmv_count->bits[j][0] * + (pre_nmv_prob->bits[j] - cur_nmv_prob->bits[j]); + + pre_last_prob = MAX_PROB - pre_nmv_prob->bits[j]; + cur_last_prob = MAX_PROB - cur_nmv_prob->bits[j]; + + diff += (int)nmv_count->bits[j][1] * (pre_last_prob - cur_last_prob); + } + + // class0_fp + for (j = 0; j < CLASS0_SIZE; ++j) { + for (k = 0; k < MV_FP_SIZE - 1; ++k) { + diff += (int)nmv_count->class0_fp[j][k] * + (pre_nmv_prob->class0_fp[j][k] - cur_nmv_prob->class0_fp[j][k]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->class0_fp[j][MV_FP_SIZE - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->class0_fp[j][MV_FP_SIZE - 2]; + + diff += (int)nmv_count->class0_fp[j][MV_FP_SIZE - 1] * + (pre_last_prob - cur_last_prob); + } + + // fp + for (j = 0; j < MV_FP_SIZE - 1; ++j) { + diff += + (int)nmv_count->fp[j] * (pre_nmv_prob->fp[j] - cur_nmv_prob->fp[j]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->fp[MV_FP_SIZE - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->fp[MV_FP_SIZE - 2]; + + diff += + (int)nmv_count->fp[MV_FP_SIZE - 1] * (pre_last_prob - cur_last_prob); + + // class0_hp + diff += (int)nmv_count->class0_hp[0] * + (pre_nmv_prob->class0_hp - cur_nmv_prob->class0_hp); + + pre_last_prob = MAX_PROB - pre_nmv_prob->class0_hp; + cur_last_prob = MAX_PROB - cur_nmv_prob->class0_hp; + + diff += (int)nmv_count->class0_hp[1] * (pre_last_prob - cur_last_prob); + + // hp + diff += (int)nmv_count->hp[0] * (pre_nmv_prob->hp - cur_nmv_prob->hp); + + pre_last_prob = MAX_PROB - pre_nmv_prob->hp; + cur_last_prob = MAX_PROB - cur_nmv_prob->hp; + + diff += (int)nmv_count->hp[1] * (pre_last_prob - cur_last_prob); + } + + return -diff; +} + // Test for whether to calculate metrics for the frame. 
static int is_psnr_calc_enabled(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; @@ -108,26 +436,39 @@ static int is_psnr_calc_enabled(VP9_COMP *cpi) { } /* clang-format off */ -static const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = { - { LEVEL_1, 829440, 36864, 200, 400, 2, 1, 4, 8 }, - { LEVEL_1_1, 2764800, 73728, 800, 1000, 2, 1, 4, 8 }, - { LEVEL_2, 4608000, 122880, 1800, 1500, 2, 1, 4, 8 }, - { LEVEL_2_1, 9216000, 245760, 3600, 2800, 2, 2, 4, 8 }, - { LEVEL_3, 20736000, 552960, 7200, 6000, 2, 4, 4, 8 }, - { LEVEL_3_1, 36864000, 983040, 12000, 10000, 2, 4, 4, 8 }, - { LEVEL_4, 83558400, 2228224, 18000, 16000, 4, 4, 4, 8 }, - { LEVEL_4_1, 160432128, 2228224, 30000, 18000, 4, 4, 5, 6 }, - { LEVEL_5, 311951360, 8912896, 60000, 36000, 6, 8, 6, 4 }, - { LEVEL_5_1, 588251136, 8912896, 120000, 46000, 8, 8, 10, 4 }, +const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = { + // sample rate size breadth bitrate cpb + { LEVEL_1, 829440, 36864, 512, 200, 400, 2, 1, 4, 8 }, + { LEVEL_1_1, 2764800, 73728, 768, 800, 1000, 2, 1, 4, 8 }, + { LEVEL_2, 4608000, 122880, 960, 1800, 1500, 2, 1, 4, 8 }, + { LEVEL_2_1, 9216000, 245760, 1344, 3600, 2800, 2, 2, 4, 8 }, + { LEVEL_3, 20736000, 552960, 2048, 7200, 6000, 2, 4, 4, 8 }, + { LEVEL_3_1, 36864000, 983040, 2752, 12000, 10000, 2, 4, 4, 8 }, + { LEVEL_4, 83558400, 2228224, 4160, 18000, 16000, 4, 4, 4, 8 }, + { LEVEL_4_1, 160432128, 2228224, 4160, 30000, 18000, 4, 4, 5, 6 }, + { LEVEL_5, 311951360, 8912896, 8384, 60000, 36000, 6, 8, 6, 4 }, + { LEVEL_5_1, 588251136, 8912896, 8384, 120000, 46000, 8, 8, 10, 4 }, // TODO(huisu): update max_cpb_size for level 5_2 ~ 6_2 when - // they are finalized (currently TBD). - { LEVEL_5_2, 1176502272, 8912896, 180000, 0, 8, 8, 10, 4 }, - { LEVEL_6, 1176502272, 35651584, 180000, 0, 8, 16, 10, 4 }, - { LEVEL_6_1, 2353004544u, 35651584, 240000, 0, 8, 16, 10, 4 }, - { LEVEL_6_2, 4706009088u, 35651584, 480000, 0, 8, 16, 10, 4 }, + // they are finalized (currently tentative). + { LEVEL_5_2, 1176502272, 8912896, 8384, 180000, 90000, 8, 8, 10, 4 }, + { LEVEL_6, 1176502272, 35651584, 16832, 180000, 90000, 8, 16, 10, 4 }, + { LEVEL_6_1, 2353004544u, 35651584, 16832, 240000, 180000, 8, 16, 10, 4 }, + { LEVEL_6_2, 4706009088u, 35651584, 16832, 480000, 360000, 8, 16, 10, 4 }, }; /* clang-format on */ +static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = { + "The average bit-rate is too high.", + "The picture size is too large.", + "The picture width/height is too large.", + "The luma sample rate is too large.", + "The CPB size is too large.", + "The compression ratio is too small", + "Too many column tiles are used.", + "The alt-ref distance is too small.", + "Too many reference buffers are used." 
+}; + static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { switch (mode) { case NORMAL: @@ -224,9 +565,12 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { for (i = 0; i < VP9_LEVELS; ++i) { this_level = &vp9_level_defs[i]; - if ((double)level_spec->max_luma_sample_rate * (1 + SAMPLE_RATE_GRACE_P) > - (double)this_level->max_luma_sample_rate || + if ((double)level_spec->max_luma_sample_rate > + (double)this_level->max_luma_sample_rate * + (1 + SAMPLE_RATE_GRACE_P) || level_spec->max_luma_picture_size > this_level->max_luma_picture_size || + level_spec->max_luma_picture_breadth > + this_level->max_luma_picture_breadth || level_spec->average_bitrate > this_level->average_bitrate || level_spec->max_cpb_size > this_level->max_cpb_size || level_spec->compression_ratio < this_level->compression_ratio || @@ -399,7 +743,9 @@ void vp9_initialize_enc(void) { vp9_init_me_luts(); vp9_rc_init_minq_luts(); vp9_entropy_mv_init(); +#if !CONFIG_REALTIME_ONLY vp9_temporal_filter_init(); +#endif init_done = 1; } } @@ -439,6 +785,32 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { cpi->nmvsadcosts_hp[0] = NULL; cpi->nmvsadcosts_hp[1] = NULL; + vpx_free(cpi->skin_map); + cpi->skin_map = NULL; + + vpx_free(cpi->prev_partition); + cpi->prev_partition = NULL; + + vpx_free(cpi->svc.prev_partition_svc); + cpi->svc.prev_partition_svc = NULL; + + vpx_free(cpi->prev_segment_id); + cpi->prev_segment_id = NULL; + + vpx_free(cpi->prev_variance_low); + cpi->prev_variance_low = NULL; + + vpx_free(cpi->copied_frame_cnt); + cpi->copied_frame_cnt = NULL; + + vpx_free(cpi->content_state_sb_fd); + cpi->content_state_sb_fd = NULL; + + vpx_free(cpi->count_arf_frame_usage); + cpi->count_arf_frame_usage = NULL; + vpx_free(cpi->count_lastgolden_frame_usage); + cpi->count_lastgolden_frame_usage = NULL; + vp9_cyclic_refresh_free(cpi->cyclic_refresh); cpi->cyclic_refresh = NULL; @@ -468,6 +840,9 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vpx_free(cpi->tile_tok[0][0]); cpi->tile_tok[0][0] = 0; + vpx_free(cpi->tplist[0][0]); + cpi->tplist[0][0] = NULL; + vp9_free_pc_tree(&cpi->td); for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { @@ -553,6 +928,7 @@ static void restore_coding_context(VP9_COMP *cpi) { *cm->fc = cc->fc; } +#if !CONFIG_REALTIME_ONLY static void configure_static_seg_features(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; @@ -676,6 +1052,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) { } } } +#endif // !CONFIG_REALTIME_ONLY static void update_reference_segmentation_map(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; @@ -802,6 +1179,7 @@ static int alloc_context_buffers_ext(VP9_COMP *cpi) { static void alloc_compressor_data(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; + int sb_rows; vp9_alloc_context_buffers(cm, cm->width, cm->height); @@ -815,6 +1193,12 @@ static void alloc_compressor_data(VP9_COMP *cpi) { vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0]))); } + sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + vpx_free(cpi->tplist[0][0]); + CHECK_MEM_ERROR( + cm, cpi->tplist[0][0], + vpx_calloc(sb_rows * 4 * (1 << 6), sizeof(*cpi->tplist[0][0]))); + vp9_setup_pc_tree(&cpi->common, &cpi->td); } @@ -838,6 +1222,14 @@ static void set_tile_limits(VP9_COMP *cpi) { clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); cm->log2_tile_rows = cpi->oxcf.tile_rows; } + + if (cpi->oxcf.target_level == LEVEL_AUTO) { + const int level_tile_cols = + 
log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height); + if (cm->log2_tile_cols > level_tile_cols) { + cm->log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols); + } + } } static void update_frame_size(VP9_COMP *cpi) { @@ -872,6 +1264,22 @@ static void init_buffer_indices(VP9_COMP *cpi) { cpi->alt_fb_idx = 2; } +static void init_level_constraint(LevelConstraint *lc) { + lc->level_index = -1; + lc->max_cpb_size = INT_MAX; + lc->max_frame_size = INT_MAX; + lc->rc_config_updated = 0; + lc->fail_flag = 0; +} + +static void set_level_constraint(LevelConstraint *ls, int8_t level_index) { + vpx_clear_system_state(); + ls->level_index = level_index; + if (level_index >= 0) { + ls->max_cpb_size = vp9_level_defs[level_index].max_cpb_size * (double)1000; + } +} + static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; @@ -887,6 +1295,8 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { cpi->target_level = oxcf->target_level; cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX; + set_level_constraint(&cpi->level_constraint, + get_level_index(cpi->target_level)); cm->width = oxcf->width; cm->height = oxcf->height; @@ -935,14 +1345,12 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, } #if CONFIG_VP9_HIGHBITDEPTH -#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx3f = SDX3F; \ - cpi->fn_ptr[BT].sdx8f = SDX8F; \ +#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ cpi->fn_ptr[BT].sdx4df = SDX4DF; #define MAKE_BFP_SAD_WRAPPER(fnname) \ @@ -981,47 +1389,6 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, 4; \ } -#define MAKE_BFP_SAD3_WRAPPER(fnname) \ - static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - } \ - static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 3; i++) sad_array[i] >>= 2; \ - } \ - static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 3; i++) sad_array[i] >>= 4; \ - } - -#define MAKE_BFP_SAD8_WRAPPER(fnname) \ - static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - } \ - static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 8; i++) sad_array[i] >>= 2; \ - } \ - static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, 
source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 8; i++) sad_array[i] >>= 4; \ - } #define MAKE_BFP_SAD4D_WRAPPER(fnname) \ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ const uint8_t *const ref_ptr[], int ref_stride, \ @@ -1057,46 +1424,30 @@ MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad32x32x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad32x32x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad64x64x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad64x64x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x16x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x16x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x8x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x8x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x16x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x16x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x8x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x8x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x4x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x8x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad4x4x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x4x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d) static void highbd_set_var_fns(VP9_COMP *const cpi) { @@ -1107,253 +1458,236 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8, vpx_highbd_sad32x16_avg_bits8, vpx_highbd_8_variance32x16, vpx_highbd_8_sub_pixel_variance32x16, - vpx_highbd_8_sub_pixel_avg_variance32x16, NULL, NULL, + vpx_highbd_8_sub_pixel_avg_variance32x16, vpx_highbd_sad32x16x4d_bits8) HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8, vpx_highbd_sad16x32_avg_bits8, vpx_highbd_8_variance16x32, vpx_highbd_8_sub_pixel_variance16x32, - vpx_highbd_8_sub_pixel_avg_variance16x32, NULL, NULL, + vpx_highbd_8_sub_pixel_avg_variance16x32, vpx_highbd_sad16x32x4d_bits8) HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8, vpx_highbd_sad64x32_avg_bits8, vpx_highbd_8_variance64x32, vpx_highbd_8_sub_pixel_variance64x32, - vpx_highbd_8_sub_pixel_avg_variance64x32, NULL, NULL, + vpx_highbd_8_sub_pixel_avg_variance64x32, vpx_highbd_sad64x32x4d_bits8) HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8, vpx_highbd_sad32x64_avg_bits8, vpx_highbd_8_variance32x64, vpx_highbd_8_sub_pixel_variance32x64, - vpx_highbd_8_sub_pixel_avg_variance32x64, NULL, NULL, + vpx_highbd_8_sub_pixel_avg_variance32x64, 
vpx_highbd_sad32x64x4d_bits8) HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8, vpx_highbd_sad32x32_avg_bits8, vpx_highbd_8_variance32x32, vpx_highbd_8_sub_pixel_variance32x32, vpx_highbd_8_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x3_bits8, vpx_highbd_sad32x32x8_bits8, vpx_highbd_sad32x32x4d_bits8) HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8, vpx_highbd_sad64x64_avg_bits8, vpx_highbd_8_variance64x64, vpx_highbd_8_sub_pixel_variance64x64, vpx_highbd_8_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x3_bits8, vpx_highbd_sad64x64x8_bits8, vpx_highbd_sad64x64x4d_bits8) HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8, vpx_highbd_sad16x16_avg_bits8, vpx_highbd_8_variance16x16, vpx_highbd_8_sub_pixel_variance16x16, vpx_highbd_8_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x3_bits8, vpx_highbd_sad16x16x8_bits8, vpx_highbd_sad16x16x4d_bits8) - HIGHBD_BFP( - BLOCK_16X8, vpx_highbd_sad16x8_bits8, vpx_highbd_sad16x8_avg_bits8, - vpx_highbd_8_variance16x8, vpx_highbd_8_sub_pixel_variance16x8, - vpx_highbd_8_sub_pixel_avg_variance16x8, vpx_highbd_sad16x8x3_bits8, - vpx_highbd_sad16x8x8_bits8, vpx_highbd_sad16x8x4d_bits8) + HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits8, + vpx_highbd_sad16x8_avg_bits8, vpx_highbd_8_variance16x8, + vpx_highbd_8_sub_pixel_variance16x8, + vpx_highbd_8_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits8) - HIGHBD_BFP( - BLOCK_8X16, vpx_highbd_sad8x16_bits8, vpx_highbd_sad8x16_avg_bits8, - vpx_highbd_8_variance8x16, vpx_highbd_8_sub_pixel_variance8x16, - vpx_highbd_8_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits8, - vpx_highbd_sad8x16x8_bits8, vpx_highbd_sad8x16x4d_bits8) + HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits8, + vpx_highbd_sad8x16_avg_bits8, vpx_highbd_8_variance8x16, + vpx_highbd_8_sub_pixel_variance8x16, + vpx_highbd_8_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits8) HIGHBD_BFP( BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8, vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, - vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits8, - vpx_highbd_sad8x8x8_bits8, vpx_highbd_sad8x8x4d_bits8) + vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x4d_bits8) - HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8, - vpx_highbd_sad8x4_avg_bits8, vpx_highbd_8_variance8x4, - vpx_highbd_8_sub_pixel_variance8x4, - vpx_highbd_8_sub_pixel_avg_variance8x4, NULL, - vpx_highbd_sad8x4x8_bits8, vpx_highbd_sad8x4x4d_bits8) + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8, + vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4, + vpx_highbd_8_sub_pixel_avg_variance8x4, vpx_highbd_sad8x4x4d_bits8) - HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8, - vpx_highbd_sad4x8_avg_bits8, vpx_highbd_8_variance4x8, - vpx_highbd_8_sub_pixel_variance4x8, - vpx_highbd_8_sub_pixel_avg_variance4x8, NULL, - vpx_highbd_sad4x8x8_bits8, vpx_highbd_sad4x8x4d_bits8) + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8, + vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8, + vpx_highbd_8_sub_pixel_avg_variance4x8, vpx_highbd_sad4x8x4d_bits8) HIGHBD_BFP( BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8, vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, - vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits8, - vpx_highbd_sad4x4x8_bits8, vpx_highbd_sad4x4x4d_bits8) + vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits8) break; case VPX_BITS_10: HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10, 
vpx_highbd_sad32x16_avg_bits10, vpx_highbd_10_variance32x16, vpx_highbd_10_sub_pixel_variance32x16, - vpx_highbd_10_sub_pixel_avg_variance32x16, NULL, NULL, + vpx_highbd_10_sub_pixel_avg_variance32x16, vpx_highbd_sad32x16x4d_bits10) HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10, vpx_highbd_sad16x32_avg_bits10, vpx_highbd_10_variance16x32, vpx_highbd_10_sub_pixel_variance16x32, - vpx_highbd_10_sub_pixel_avg_variance16x32, NULL, NULL, + vpx_highbd_10_sub_pixel_avg_variance16x32, vpx_highbd_sad16x32x4d_bits10) HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10, vpx_highbd_sad64x32_avg_bits10, vpx_highbd_10_variance64x32, vpx_highbd_10_sub_pixel_variance64x32, - vpx_highbd_10_sub_pixel_avg_variance64x32, NULL, NULL, + vpx_highbd_10_sub_pixel_avg_variance64x32, vpx_highbd_sad64x32x4d_bits10) HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10, vpx_highbd_sad32x64_avg_bits10, vpx_highbd_10_variance32x64, vpx_highbd_10_sub_pixel_variance32x64, - vpx_highbd_10_sub_pixel_avg_variance32x64, NULL, NULL, + vpx_highbd_10_sub_pixel_avg_variance32x64, vpx_highbd_sad32x64x4d_bits10) HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10, vpx_highbd_sad32x32_avg_bits10, vpx_highbd_10_variance32x32, vpx_highbd_10_sub_pixel_variance32x32, vpx_highbd_10_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x3_bits10, vpx_highbd_sad32x32x8_bits10, vpx_highbd_sad32x32x4d_bits10) HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10, vpx_highbd_sad64x64_avg_bits10, vpx_highbd_10_variance64x64, vpx_highbd_10_sub_pixel_variance64x64, vpx_highbd_10_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x3_bits10, vpx_highbd_sad64x64x8_bits10, vpx_highbd_sad64x64x4d_bits10) HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10, vpx_highbd_sad16x16_avg_bits10, vpx_highbd_10_variance16x16, vpx_highbd_10_sub_pixel_variance16x16, vpx_highbd_10_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x3_bits10, vpx_highbd_sad16x16x8_bits10, vpx_highbd_sad16x16x4d_bits10) HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10, vpx_highbd_sad16x8_avg_bits10, vpx_highbd_10_variance16x8, vpx_highbd_10_sub_pixel_variance16x8, vpx_highbd_10_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x3_bits10, vpx_highbd_sad16x8x8_bits10, vpx_highbd_sad16x8x4d_bits10) HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10, vpx_highbd_sad8x16_avg_bits10, vpx_highbd_10_variance8x16, vpx_highbd_10_sub_pixel_variance8x16, vpx_highbd_10_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x3_bits10, vpx_highbd_sad8x16x8_bits10, vpx_highbd_sad8x16x4d_bits10) - HIGHBD_BFP( - BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad8x8_avg_bits10, - vpx_highbd_10_variance8x8, vpx_highbd_10_sub_pixel_variance8x8, - vpx_highbd_10_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits10, - vpx_highbd_sad8x8x8_bits10, vpx_highbd_sad8x8x4d_bits10) + HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits10, + vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, + vpx_highbd_10_sub_pixel_variance8x8, + vpx_highbd_10_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits10) HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, vpx_highbd_10_sub_pixel_variance8x4, - vpx_highbd_10_sub_pixel_avg_variance8x4, NULL, - vpx_highbd_sad8x4x8_bits10, vpx_highbd_sad8x4x4d_bits10) + vpx_highbd_10_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits10) HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, vpx_highbd_10_sub_pixel_variance4x8, - vpx_highbd_10_sub_pixel_avg_variance4x8, NULL, - 
vpx_highbd_sad4x8x8_bits10, vpx_highbd_sad4x8x4d_bits10) + vpx_highbd_10_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits10) - HIGHBD_BFP( - BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad4x4_avg_bits10, - vpx_highbd_10_variance4x4, vpx_highbd_10_sub_pixel_variance4x4, - vpx_highbd_10_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits10, - vpx_highbd_sad4x4x8_bits10, vpx_highbd_sad4x4x4d_bits10) + HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits10, + vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, + vpx_highbd_10_sub_pixel_variance4x4, + vpx_highbd_10_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits10) break; case VPX_BITS_12: HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16, - vpx_highbd_12_sub_pixel_avg_variance32x16, NULL, NULL, + vpx_highbd_12_sub_pixel_avg_variance32x16, vpx_highbd_sad32x16x4d_bits12) HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12, vpx_highbd_sad16x32_avg_bits12, vpx_highbd_12_variance16x32, vpx_highbd_12_sub_pixel_variance16x32, - vpx_highbd_12_sub_pixel_avg_variance16x32, NULL, NULL, + vpx_highbd_12_sub_pixel_avg_variance16x32, vpx_highbd_sad16x32x4d_bits12) HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12, vpx_highbd_sad64x32_avg_bits12, vpx_highbd_12_variance64x32, vpx_highbd_12_sub_pixel_variance64x32, - vpx_highbd_12_sub_pixel_avg_variance64x32, NULL, NULL, + vpx_highbd_12_sub_pixel_avg_variance64x32, vpx_highbd_sad64x32x4d_bits12) HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12, vpx_highbd_sad32x64_avg_bits12, vpx_highbd_12_variance32x64, vpx_highbd_12_sub_pixel_variance32x64, - vpx_highbd_12_sub_pixel_avg_variance32x64, NULL, NULL, + vpx_highbd_12_sub_pixel_avg_variance32x64, vpx_highbd_sad32x64x4d_bits12) HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12, vpx_highbd_sad32x32_avg_bits12, vpx_highbd_12_variance32x32, vpx_highbd_12_sub_pixel_variance32x32, vpx_highbd_12_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x3_bits12, vpx_highbd_sad32x32x8_bits12, vpx_highbd_sad32x32x4d_bits12) HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12, vpx_highbd_sad64x64_avg_bits12, vpx_highbd_12_variance64x64, vpx_highbd_12_sub_pixel_variance64x64, vpx_highbd_12_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x3_bits12, vpx_highbd_sad64x64x8_bits12, vpx_highbd_sad64x64x4d_bits12) HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12, vpx_highbd_sad16x16_avg_bits12, vpx_highbd_12_variance16x16, vpx_highbd_12_sub_pixel_variance16x16, vpx_highbd_12_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x3_bits12, vpx_highbd_sad16x16x8_bits12, vpx_highbd_sad16x16x4d_bits12) HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12, vpx_highbd_sad16x8_avg_bits12, vpx_highbd_12_variance16x8, vpx_highbd_12_sub_pixel_variance16x8, vpx_highbd_12_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x3_bits12, vpx_highbd_sad16x8x8_bits12, vpx_highbd_sad16x8x4d_bits12) HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12, vpx_highbd_sad8x16_avg_bits12, vpx_highbd_12_variance8x16, vpx_highbd_12_sub_pixel_variance8x16, vpx_highbd_12_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x3_bits12, vpx_highbd_sad8x16x8_bits12, vpx_highbd_sad8x16x4d_bits12) - HIGHBD_BFP( - BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad8x8_avg_bits12, - vpx_highbd_12_variance8x8, vpx_highbd_12_sub_pixel_variance8x8, - vpx_highbd_12_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits12, - vpx_highbd_sad8x8x8_bits12, vpx_highbd_sad8x8x4d_bits12) + HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits12, + 
vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, + vpx_highbd_12_sub_pixel_variance8x8, + vpx_highbd_12_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits12) HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, vpx_highbd_12_sub_pixel_variance8x4, - vpx_highbd_12_sub_pixel_avg_variance8x4, NULL, - vpx_highbd_sad8x4x8_bits12, vpx_highbd_sad8x4x4d_bits12) + vpx_highbd_12_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits12) HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, vpx_highbd_12_sub_pixel_variance4x8, - vpx_highbd_12_sub_pixel_avg_variance4x8, NULL, - vpx_highbd_sad4x8x8_bits12, vpx_highbd_sad4x8x4d_bits12) + vpx_highbd_12_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits12) - HIGHBD_BFP( - BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad4x4_avg_bits12, - vpx_highbd_12_variance4x4, vpx_highbd_12_sub_pixel_variance4x4, - vpx_highbd_12_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits12, - vpx_highbd_sad4x4x8_bits12, vpx_highbd_sad4x4x4d_bits12) + HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits12, + vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, + vpx_highbd_12_sub_pixel_variance4x4, + vpx_highbd_12_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits12) break; default: @@ -1390,6 +1724,33 @@ static void realloc_segmentation_maps(VP9_COMP *cpi) { vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); } +static void alloc_copy_partition_data(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + if (cpi->prev_partition == NULL) { + CHECK_MEM_ERROR(cm, cpi->prev_partition, + (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, + sizeof(*cpi->prev_partition))); + } + if (cpi->prev_segment_id == NULL) { + CHECK_MEM_ERROR( + cm, cpi->prev_segment_id, + (int8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->prev_segment_id))); + } + if (cpi->prev_variance_low == NULL) { + CHECK_MEM_ERROR(cm, cpi->prev_variance_low, + (uint8_t *)vpx_calloc( + (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * 25, + sizeof(*cpi->prev_variance_low))); + } + if (cpi->copied_frame_cnt == NULL) { + CHECK_MEM_ERROR( + cm, cpi->copied_frame_cnt, + (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->copied_frame_cnt))); + } +} + void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -1403,6 +1764,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { cpi->target_level = oxcf->target_level; cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX; + set_level_constraint(&cpi->level_constraint, + get_level_index(cpi->target_level)); if (cm->profile <= PROFILE_1) assert(cm->bit_depth == VPX_BITS_8); @@ -1481,13 +1844,17 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { } } - update_frame_size(cpi); + if (cm->current_video_frame == 0 || last_w != cpi->oxcf.width || + last_h != cpi->oxcf.height) + update_frame_size(cpi); if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) { memset(cpi->consec_zero_mv, 0, cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_reset_resize(cpi); + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; } if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || @@ -1498,6 +1865,24 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { 
(int)cpi->oxcf.target_bandwidth); } + // Check for resetting the rc flags (rc_1_frame, rc_2_frame) if the + // configuration change has a large change in avg_frame_bandwidth. + // For SVC check for resetting based on spatial layer average bandwidth. + // Also reset buffer level to optimal level. + if (cm->current_video_frame > 0) { + if (cpi->use_svc) { + vp9_svc_check_reset_layer_rc_flag(cpi); + } else { + if (rc->avg_frame_bandwidth > (3 * rc->last_avg_frame_bandwidth >> 1) || + rc->avg_frame_bandwidth < (rc->last_avg_frame_bandwidth >> 1)) { + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; + rc->bits_off_target = rc->optimal_buffer_level; + rc->buffer_level = rc->optimal_buffer_level; + } + } + } + cpi->alt_ref_source = NULL; rc->is_src_frame_alt_ref = 0; @@ -1515,6 +1900,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); #endif + + vp9_set_row_mt(cpi); } #ifndef M_LOG2_E @@ -1612,7 +1999,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts))); cpi->use_svc = 0; - cpi->resize_state = 0; + cpi->resize_state = ORIG; cpi->external_resize = 0; cpi->resize_avg_qp = 0; cpi->resize_buffer_underflow = 0; @@ -1630,6 +2017,9 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, realloc_segmentation_maps(cpi); + CHECK_MEM_ERROR(cm, cpi->skin_map, vpx_calloc(cm->mi_rows * cm->mi_cols, + sizeof(cpi->skin_map[0]))); + CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); CHECK_MEM_ERROR( @@ -1677,6 +2067,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; init_level_info(&cpi->level_info); + init_level_constraint(&cpi->level_constraint); #if CONFIG_INTERNAL_STATS cpi->b_calculate_blockiness = 1; @@ -1745,7 +2136,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, #endif #endif #ifdef OUTPUT_YUV_SKINMAP - yuv_skinmap_file = fopen("skinmap.yuv", "ab"); + yuv_skinmap_file = fopen("skinmap.yuv", "wb"); #endif #ifdef OUTPUT_YUV_REC yuv_rec_file = fopen("rec.yuv", "wb"); @@ -1758,6 +2149,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; +#if !CONFIG_REALTIME_ONLY if (oxcf->pass == 1) { vp9_init_first_pass(cpi); } else if (oxcf->pass == 2) { @@ -1822,6 +2214,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, vp9_init_second_pass(cpi); } } +#endif // !CONFIG_REALTIME_ONLY vp9_set_speed_features_framesize_independent(cpi); vp9_set_speed_features_framesize_dependent(cpi); @@ -1831,67 +2224,61 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->source_var_thresh = 0; cpi->frames_till_next_var_check = 0; -#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx3f = SDX3F; \ - cpi->fn_ptr[BT].sdx8f = SDX8F; \ +#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ cpi->fn_ptr[BT].sdx4df = SDX4DF; BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16, - vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16, NULL, NULL, + vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16, vpx_sad32x16x4d) BFP(BLOCK_16X32, vpx_sad16x32, 
vpx_sad16x32_avg, vpx_variance16x32, - vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, NULL, NULL, + vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, vpx_sad16x32x4d) BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32, - vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, NULL, NULL, + vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, vpx_sad64x32x4d) BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64, - vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, NULL, NULL, + vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, vpx_sad32x64x4d) BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32, vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32, - vpx_sad32x32x3, vpx_sad32x32x8, vpx_sad32x32x4d) + vpx_sad32x32x4d) BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64, vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64, - vpx_sad64x64x3, vpx_sad64x64x8, vpx_sad64x64x4d) + vpx_sad64x64x4d) BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16, vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16, - vpx_sad16x16x3, vpx_sad16x16x8, vpx_sad16x16x4d) + vpx_sad16x16x4d) BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8, - vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x3, - vpx_sad16x8x8, vpx_sad16x8x4d) + vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, + vpx_sad16x8x4d) BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16, - vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x3, - vpx_sad8x16x8, vpx_sad8x16x4d) + vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, + vpx_sad8x16x4d) BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8, - vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x3, - vpx_sad8x8x8, vpx_sad8x8x4d) + vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d) BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4, - vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, NULL, - vpx_sad8x4x8, vpx_sad8x4x4d) + vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d) BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8, - vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, NULL, - vpx_sad4x8x8, vpx_sad4x8x4d) + vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d) BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4, - vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x3, - vpx_sad4x4x8, vpx_sad4x4x4d) + vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d) #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); @@ -1958,16 +2345,20 @@ void vp9_remove_compressor(VP9_COMP *cpi) { snprintf(headings, sizeof(headings), "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t" "VPXSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t" - "WstPsnr\tWstSsim\tWstFast\tWstHVS"); + "WstPsnr\tWstSsim\tWstFast\tWstHVS\t" + "AVPsnrY\tAPsnrCb\tAPsnrCr"); snprintf(results, sizeof(results), "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" - "%7.3f\t%7.3f\t%7.3f\t%7.3f", + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f", dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr, cpi->psnrp.stat[ALL] / cpi->count, totalp_psnr, total_ssim, totalp_ssim, cpi->fastssim.stat[ALL] / cpi->count, cpi->psnrhvs.stat[ALL] / cpi->count, cpi->psnr.worst, - cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst); + cpi->worst_ssim, 
cpi->fastssim.worst, cpi->psnrhvs.worst, + cpi->psnr.stat[Y] / cpi->count, cpi->psnr.stat[U] / cpi->count, + cpi->psnr.stat[V] / cpi->count); if (cpi->b_calculate_blockiness) { SNPRINT(headings, "\t Block\tWstBlck"); @@ -2027,8 +2418,12 @@ void vp9_remove_compressor(VP9_COMP *cpi) { } vpx_free(cpi->tile_thr_data); vpx_free(cpi->workers); + vp9_row_mt_mem_dealloc(cpi); - if (cpi->num_workers > 1) vp9_loop_filter_dealloc(&cpi->lf_row_sync); + if (cpi->num_workers > 1) { + vp9_loop_filter_dealloc(&cpi->lf_row_sync); + vp9_bitstream_encode_tiles_buffer_dealloc(cpi); + } vp9_alt_ref_aq_destroy(cpi->alt_ref_aq); @@ -2136,7 +2531,7 @@ int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag); if (cfg) { - vp8_yv12_copy_frame(cfg, sd); + vpx_yv12_copy_frame(cfg, sd); return 0; } else { return -1; @@ -2147,7 +2542,7 @@ int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag); if (cfg) { - vp8_yv12_copy_frame(sd, cfg); + vpx_yv12_copy_frame(sd, cfg); return 0; } else { return -1; @@ -2160,38 +2555,6 @@ int vp9_update_entropy(VP9_COMP *cpi, int update) { return 0; } -#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP) -// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it -// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do -// not denoise the UV channels at this time. If ever we implement UV channel -// denoising we will have to modify this. -void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) { - uint8_t *src = s->y_buffer; - int h = s->y_height; - - do { - fwrite(src, s->y_width, 1, f); - src += s->y_stride; - } while (--h); - - src = s->u_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, f); - src += s->uv_stride; - } while (--h); - - src = s->v_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, f); - src += s->uv_stride; - } while (--h); -} -#endif - #ifdef OUTPUT_YUV_REC void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { YV12_BUFFER_CONFIG *s = cm->frame_to_show; @@ -2297,7 +2660,9 @@ static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, #if CONFIG_VP9_HIGHBITDEPTH static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, int bd) { + YV12_BUFFER_CONFIG *dst, int bd, + INTERP_FILTER filter_type, + int phase_scaler) { const int src_w = src->y_crop_width; const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; @@ -2307,7 +2672,7 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; - const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; + const InterpKernel *const kernel = vp9_filter_kernels[filter_type]; int x, y, i; for (i = 0; i < MAX_MB_PLANE; ++i) { @@ -2315,24 +2680,24 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int src_stride = src_strides[i]; const int dst_stride = dst_strides[i]; for (y = 0; y < dst_h; y += 16) { - const int y_q4 = y * (16 / factor) * src_h / dst_h; + const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler; for (x = 0; x < dst_w; x += 16) { - const int x_q4 = x * (16 / factor) * 
src_w / dst_w; + const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler; const uint8_t *src_ptr = srcs[i] + (y / factor) * src_h / dst_h * src_stride + (x / factor) * src_w / dst_w; uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); if (src->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, - 16 / factor, 16 / factor, bd); + vpx_highbd_convolve8(CONVERT_TO_SHORTPTR(src_ptr), src_stride, + CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor, + bd); } else { - vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor, - 16 / factor); + vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor); } } } @@ -2340,44 +2705,6 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, vpx_extend_frame_borders(dst); } -#else -void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { - const int src_w = src->y_crop_width; - const int src_h = src->y_crop_height; - const int dst_w = dst->y_crop_width; - const int dst_h = dst->y_crop_height; - const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, - src->v_buffer }; - const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; - uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; - const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; - const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; - int x, y, i; - - for (i = 0; i < MAX_MB_PLANE; ++i) { - const int factor = (i == 0 || i == 3 ? 
1 : 2); - const int src_stride = src_strides[i]; - const int dst_stride = dst_strides[i]; - for (y = 0; y < dst_h; y += 16) { - const int y_q4 = y * (16 / factor) * src_h / dst_h; - for (x = 0; x < dst_w; x += 16) { - const int x_q4 = x * (16 / factor) * src_w / dst_w; - const uint8_t *src_ptr = srcs[i] + - (y / factor) * src_h / dst_h * src_stride + - (x / factor) * src_w / dst_w; - uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); - - vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor, - 16 / factor); - } - } - } - - vpx_extend_frame_borders(dst); -} #endif // CONFIG_VP9_HIGHBITDEPTH static int scale_down(VP9_COMP *cpi, int q) { @@ -2396,11 +2723,33 @@ static int scale_down(VP9_COMP *cpi, int q) { return scale; } -static int big_rate_miss(VP9_COMP *cpi, int high_limit, int low_limit) { +static int big_rate_miss_high_threshold(VP9_COMP *cpi) { const RATE_CONTROL *const rc = &cpi->rc; + int big_miss_high; - return (rc->projected_frame_size > ((high_limit * 3) / 2)) || - (rc->projected_frame_size < (low_limit / 2)); + if (frame_is_kf_gf_arf(cpi)) + big_miss_high = rc->this_frame_target * 3 / 2; + else + big_miss_high = rc->this_frame_target * 2; + + return big_miss_high; +} + +static int big_rate_miss(VP9_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + int big_miss_high; + int big_miss_low; + + // Ignore for overlay frames + if (rc->is_src_frame_alt_ref) { + return 0; + } else { + big_miss_low = (rc->this_frame_target / 2); + big_miss_high = big_rate_miss_high_threshold(cpi); + + return (rc->projected_frame_size > big_miss_high) || + (rc->projected_frame_size < big_miss_low); + } } // test in two pass for the first @@ -2425,8 +2774,7 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, int force_recode = 0; if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || - big_rate_miss(cpi, high_limit, low_limit) || - (cpi->sf.recode_loop == ALLOW_RECODE) || + big_rate_miss(cpi) || (cpi->sf.recode_loop == ALLOW_RECODE) || (two_pass_first_group_inter(cpi) && (cpi->sf.recode_loop == ALLOW_RECODE_FIRST)) || (frame_is_kfgfarf && (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF))) { @@ -2437,6 +2785,13 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, return 1; } + // Force recode for extreme overshoot. + if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || + (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF && + rc->projected_frame_size >= big_rate_miss_high_threshold(cpi))) { + return 1; + } + // TODO(agrange) high_limit could be greater than the scale-down threshold. if ((rc->projected_frame_size > high_limit && q < maxq) || (rc->projected_frame_size < low_limit && q > minq)) { @@ -2523,12 +2878,41 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { sizeof(cpi->interp_filter_selected[0])); } #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && cpi->denoiser.denoising_level > kDenLowLow) { + int svc_base_is_key = 0; + int denoise_svc_second_layer = 0; + if (cpi->use_svc) { + int realloc_fail = 0; + const int svc_buf_shift = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 + ? 
cpi->denoiser.num_ref_frames + : 0; + int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, + cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + svc_base_is_key = lc->is_key_frame; + denoise_svc_second_layer = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 ? 1 + : 0; + // Check if we need to allocate extra buffers in the denoiser + // for + // refreshed frames. + realloc_fail = vp9_denoiser_realloc_svc( + cm, &cpi->denoiser, svc_buf_shift, cpi->refresh_alt_ref_frame, + cpi->refresh_golden_frame, cpi->refresh_last_frame, cpi->alt_fb_idx, + cpi->gld_fb_idx, cpi->lst_fb_idx); + if (realloc_fail) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to re-allocate denoiser for SVC"); + } vp9_denoiser_update_frame_info( &cpi->denoiser, *cpi->Source, cpi->common.frame_type, cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame, - cpi->refresh_last_frame, cpi->resize_pending); + cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx, + cpi->lst_fb_idx, cpi->resize_pending, svc_base_is_key, + denoise_svc_second_layer); } #endif if (is_one_pass_cbr_svc(cpi)) { @@ -2553,6 +2937,10 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { MACROBLOCKD *xd = &cpi->td.mb.e_mbd; struct loopfilter *lf = &cm->lf; + const int is_reference_frame = + (cm->frame_type == KEY_FRAME || cpi->refresh_last_frame || + cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame); + if (xd->lossless) { lf->filter_level = 0; lf->last_filt_level = 0; @@ -2578,7 +2966,7 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); } - if (lf->filter_level > 0) { + if (lf->filter_level > 0 && is_reference_frame) { vp9_build_mask_frame(cm, lf->filter_level, 0); if (cpi->num_workers > 1) @@ -2643,7 +3031,8 @@ void vp9_scale_references(VP9_COMP *cpi) { cm->byte_alignment, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); - scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth); + scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth, + EIGHTTAP, 0); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } @@ -2666,7 +3055,7 @@ void vp9_scale_references(VP9_COMP *cpi) { cm->byte_alignment, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); - vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf); + vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf, EIGHTTAP, 0); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } @@ -2794,17 +3183,37 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { dc_quant_devisor = 4.0; #endif - fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d" - "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" " - "%10"PRId64" %10"PRId64" %10d " - "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" - "%6d %6d %5d %5d %5d " - "%10"PRId64" %10.3lf" - "%10lf %8u %10"PRId64" %10d %10d %10d %10d %10d\n", + if (!cm->current_video_frame) { + fprintf(f, "frame, width, height, last ts, last end ts, " + "source_alt_ref_pending, source_alt_ref_active, " + "this_frame_target, projected_frame_size, " + "projected_frame_size / MBs, " + "projected_frame_size - this_frame_target, " + "vbr_bits_off_target, vbr_bits_off_target_fast, " + "twopass.extend_minq, twopass.extend_minq_fast, " + "total_target_vs_actual, " + "starting_buffer_level - bits_off_target, " + "total_actual_bits, base_qindex, q for 
base_qindex, " + "dc quant, q for active_worst_quality, avg_q, q for oxcf.cq_level, " + "refresh_last_frame, refresh_golden_frame, refresh_alt_ref_frame, " + "frame_type, gfu_boost, " + "twopass.bits_left, " + "twopass.total_left_stats.coded_error, " + "twopass.bits_left / (1 + twopass.total_left_stats.coded_error), " + "tot_recode_hits, recon_err, kf_boost, " + "twopass.kf_zeromotion_pct, twopass.fr_content_type, " + "filter_level, seg.aq_av_offset\n"); + } + + fprintf(f, "%10u, %d, %d, %10"PRId64", %10"PRId64", %d, %d, %10d, %10d, " + "%10d, %10d, %10"PRId64", %10"PRId64", %5d, %5d, %10"PRId64", " + "%10"PRId64", %10"PRId64", %10d, %7.2lf, %7.2lf, %7.2lf, %7.2lf, " + "%7.2lf, %6d, %6d, %5d, %5d, %5d, %10"PRId64", %10.3lf, %10lf, %8u, " + "%10"PRId64", %10d, %10d, %10d, %10d, %10d\n", cpi->common.current_video_frame, cm->width, cm->height, - cpi->td.rd_counts.m_search_count, - cpi->td.rd_counts.ex_search_count, + cpi->last_time_stamp_seen, + cpi->last_end_time_stamp_seen, cpi->rc.source_alt_ref_pending, cpi->rc.source_alt_ref_active, cpi->rc.this_frame_target, @@ -2892,7 +3301,6 @@ static void set_size_independent_vars(VP9_COMP *cpi) { static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, int *top_index) { VP9_COMMON *const cm = &cpi->common; - const VP9EncoderConfig *const oxcf = &cpi->oxcf; // Setup variables that depend on the dimensions of the frame. vp9_set_speed_features_framesize_dependent(cpi); @@ -2904,17 +3312,19 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH); } +#if !CONFIG_REALTIME_ONLY // Configure experimental use of segmentation for enhanced coding of // static regions if indicated. // Only allowed in the second pass of a two pass encode, as it requires // lagged coding, and if the relevant speed feature flag is set. 
- if (oxcf->pass == 2 && cpi->sf.static_segmentation) + if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation) configure_static_seg_features(cpi); +#endif // !CONFIG_REALTIME_ONLY #if CONFIG_VP9_POSTPROC && !(CONFIG_VP9_TEMPORAL_DENOISING) - if (oxcf->noise_sensitivity > 0) { + if (cpi->oxcf.noise_sensitivity > 0) { int l = 0; - switch (oxcf->noise_sensitivity) { + switch (cpi->oxcf.noise_sensitivity) { case 1: l = 20; break; case 2: l = 40; break; case 3: l = 60; break; @@ -2923,8 +3333,9 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, case 6: l = 150; break; } if (!cpi->common.postproc_state.limits) { - cpi->common.postproc_state.limits = vpx_calloc( - cpi->common.width, sizeof(*cpi->common.postproc_state.limits)); + cpi->common.postproc_state.limits = + vpx_calloc(cpi->un_scaled_source->y_width, + sizeof(*cpi->common.postproc_state.limits)); } vp9_denoise(cpi->Source, cpi->Source, l, cpi->common.postproc_state.limits); } @@ -2936,7 +3347,8 @@ static void setup_denoiser_buffer(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; if (cpi->oxcf.noise_sensitivity > 0 && !cpi->denoiser.frame_buffer_initialized) { - if (vp9_denoiser_alloc(&cpi->denoiser, cm->width, cm->height, + if (vp9_denoiser_alloc(cm, &cpi->svc, &cpi->denoiser, cpi->use_svc, + cpi->oxcf.noise_sensitivity, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, @@ -2964,6 +3376,7 @@ static void set_frame_size(VP9_COMP *cpi) { VP9EncoderConfig *const oxcf = &cpi->oxcf; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; +#if !CONFIG_REALTIME_ONLY if (oxcf->pass == 2 && oxcf->rc_mode == VPX_VBR && ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) || (oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending))) { @@ -2974,6 +3387,7 @@ static void set_frame_size(VP9_COMP *cpi) { vp9_set_size_literal(cpi, oxcf->scaled_frame_width, oxcf->scaled_frame_height); } +#endif // !CONFIG_REALTIME_ONLY if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR && !cpi->use_svc && oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending != 0) { @@ -3054,6 +3468,19 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest) { VP9_COMMON *const cm = &cpi->common; int q = 0, bottom_index = 0, top_index = 0; // Dummy variables. + const INTERP_FILTER filter_scaler = + (is_one_pass_cbr_svc(cpi)) + ? cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] + : EIGHTTAP; + const int phase_scaler = + (is_one_pass_cbr_svc(cpi)) + ? cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] + : 0; + + // Flag to check if its valid to compute the source sad (used for + // scene detection and for superblock content state in CBR mode). + // The flag may get reset below based on SVC or resizing state. + cpi->compute_source_sad_onepass = cpi->oxcf.mode == REALTIME; vpx_clear_system_state(); @@ -3067,8 +3494,11 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // For svc, if it is a 1/4x1/4 downscaling, do a two-stage scaling to take // advantage of the 1:2 optimized scaler. In the process, the 1/2x1/2 // result will be saved in scaled_temp and might be used later. 
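  // Illustrative sketch of the two-stage path described in the comment
  // above, assuming example dimensions only (a 1280x720 top-layer source and
  // a 320x180 spatial layer; the real sizes come from the SVC configuration,
  // and the 8-bit path is shown):
  //
  //   /* stage 1: 1280x720 -> 640x360, kept in cpi->svc.scaled_temp */
  //   vp9_scale_and_extend_frame(cpi->un_scaled_source, &cpi->svc.scaled_temp,
  //                              filter_scaler2, phase_scaler2);
  //   /* stage 2: 640x360 -> 320x180, the layer actually being encoded */
  //   vp9_scale_and_extend_frame(&cpi->svc.scaled_temp, &cpi->scaled_source,
  //                              filter_scaler, phase_scaler);
  //
  // Each stage is then a plain 2:1 downscale, the case the optimized scaler
  // handles, and the intermediate 1/2x1/2 frame stays available for possible
  // reuse by the half-resolution layer (svc.scaled_one_half below).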
+ const INTERP_FILTER filter_scaler2 = cpi->svc.downsample_filter_type[1]; + const int phase_scaler2 = cpi->svc.downsample_filter_phase[1]; cpi->Source = vp9_svc_twostage_scale( - cm, cpi->un_scaled_source, &cpi->scaled_source, &cpi->svc.scaled_temp); + cm, cpi->un_scaled_source, &cpi->scaled_source, &cpi->svc.scaled_temp, + filter_scaler, phase_scaler, filter_scaler2, phase_scaler2); cpi->svc.scaled_one_half = 1; } else if (is_one_pass_cbr_svc(cpi) && cpi->un_scaled_source->y_width == cm->width << 1 && @@ -3080,16 +3510,17 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, cpi->svc.scaled_one_half = 0; } else { cpi->Source = vp9_scale_if_required( - cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0)); + cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0), + filter_scaler, phase_scaler); } // Unfiltered raw source used in metrics calculation if the source // has been filtered. if (is_psnr_calc_enabled(cpi)) { #ifdef ENABLE_KF_DENOISE if (is_spatial_denoise_enabled(cpi)) { - cpi->raw_source_frame = - vp9_scale_if_required(cm, &cpi->raw_unscaled_source, - &cpi->raw_scaled_source, (cpi->oxcf.pass == 0)); + cpi->raw_source_frame = vp9_scale_if_required( + cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source, + (cpi->oxcf.pass == 0), EIGHTTAP, phase_scaler); } else { cpi->raw_source_frame = cpi->Source; } @@ -3098,8 +3529,21 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, #endif } + if ((cpi->use_svc && + (cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1 || + cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1 || + cpi->svc.current_superframe < 1)) || + cpi->resize_pending || cpi->resize_state || cpi->external_resize || + cpi->resize_state != ORIG) { + cpi->compute_source_sad_onepass = 0; + if (cpi->content_state_sb_fd != NULL) + memset(cpi->content_state_sb_fd, 0, + (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * + sizeof(*cpi->content_state_sb_fd)); + } + // Avoid scaling last_source unless its needed. - // Last source is needed if vp9_avg_source_sad() is used, or if + // Last source is needed if avg_source_sad() is used, or if // partition_search_type == SOURCE_VAR_BASED_PARTITION, or if noise // estimation is enabled. 
if (cpi->unscaled_last_source != NULL && @@ -3107,10 +3551,16 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5) || cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION || - cpi->noise_estimate.enabled)) - cpi->Last_Source = - vp9_scale_if_required(cm, cpi->unscaled_last_source, - &cpi->scaled_last_source, (cpi->oxcf.pass == 0)); + (cpi->noise_estimate.enabled && !cpi->oxcf.noise_sensitivity) || + cpi->compute_source_sad_onepass)) + cpi->Last_Source = vp9_scale_if_required( + cm, cpi->unscaled_last_source, &cpi->scaled_last_source, + (cpi->oxcf.pass == 0), EIGHTTAP, 0); + + if (cpi->Last_Source == NULL || + cpi->Last_Source->y_width != cpi->Source->y_width || + cpi->Last_Source->y_height != cpi->Source->y_height) + cpi->compute_source_sad_onepass = 0; if (cm->frame_type == KEY_FRAME || cpi->resize_pending != 0) { memset(cpi->consec_zero_mv, 0, @@ -3119,22 +3569,39 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, vp9_update_noise_estimate(cpi); - if (cpi->oxcf.pass == 0 && cpi->oxcf.mode == REALTIME && - cpi->oxcf.speed >= 5 && cpi->resize_state == 0 && - (cpi->oxcf.content == VP9E_CONTENT_SCREEN || - cpi->oxcf.rc_mode == VPX_VBR)) - vp9_avg_source_sad(cpi); + // Scene detection is always used for VBR mode or screen-content case. + // For other cases (e.g., CBR mode) use it for 5 <= speed < 8 for now + // (need to check encoding time cost for doing this for speed 8). + cpi->rc.high_source_sad = 0; + if (cpi->compute_source_sad_onepass && cm->show_frame && + (cpi->oxcf.rc_mode == VPX_VBR || + cpi->oxcf.content == VP9E_CONTENT_SCREEN || + (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8 && !cpi->use_svc))) + vp9_scene_detection_onepass(cpi); - // For 1 pass SVC, since only ZEROMV is allowed for upsampled reference - // frame (i.e, svc->force_zero_mode_spatial_ref = 0), we can avoid this - // frame-level upsampling. - if (frame_is_intra_only(cm) == 0 && !is_one_pass_cbr_svc(cpi)) { + // For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame + // when svc->force_zero_mode_spatial_ref = 1. Under those conditions we can + // avoid this frame-level upsampling (for non intra_only frames). + if (frame_is_intra_only(cm) == 0 && + !(is_one_pass_cbr_svc(cpi) && cpi->svc.force_zero_mode_spatial_ref)) { vp9_scale_references(cpi); } set_size_independent_vars(cpi); set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + if (cpi->sf.copy_partition_flag) alloc_copy_partition_data(cpi); + + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) { + if (cpi->svc.prev_partition_svc == NULL) { + CHECK_MEM_ERROR( + cm, cpi->svc.prev_partition_svc, + (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, + sizeof(*cpi->svc.prev_partition_svc))); + } + } + if (cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.content != VP9E_CONTENT_SCREEN && @@ -3143,7 +3610,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, } vp9_set_quantizer(cm, q); - vp9_set_variance_partition_thresholds(cpi, q); + vp9_set_variance_partition_thresholds(cpi, q, 0); setup_frame(cpi); @@ -3172,7 +3639,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // Check if we should drop this frame because of high overshoot. // Only for frames where high temporal-source SAD is detected. 
if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && - cpi->resize_state == 0 && cm->frame_type != KEY_FRAME && + cpi->resize_state == ORIG && cm->frame_type != KEY_FRAME && cpi->oxcf.content == VP9E_CONTENT_SCREEN && cpi->rc.high_source_sad == 1) { int frame_size = 0; @@ -3186,7 +3653,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if (vp9_encodedframe_overshoot(cpi, frame_size, &q)) { vpx_clear_system_state(); vp9_set_quantizer(cm, q); - vp9_set_variance_partition_thresholds(cpi, q); + vp9_set_variance_partition_thresholds(cpi, q, 0); suppress_active_map(cpi); // Turn-off cyclic refresh for re-encoded frame. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { @@ -3199,12 +3666,10 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, } } - // Update some stats from cyclic refresh, and check if we should not update - // golden reference, for non-SVC 1 pass CBR. - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->frame_type != KEY_FRAME && - !cpi->use_svc && cpi->ext_refresh_frame_flags_pending == 0 && - (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR)) - vp9_cyclic_refresh_check_golden_update(cpi); + // Update some stats from cyclic refresh, and check for golden frame update. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && + cm->frame_type != KEY_FRAME) + vp9_cyclic_refresh_postencode(cpi); // Update the skip mb flag probabilities based on the distribution // seen in the last encoder iteration. @@ -3212,8 +3677,16 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, vpx_clear_system_state(); } +#define MAX_QSTEP_ADJ 4 +static int get_qstep_adj(int rate_excess, int rate_limit) { + int qstep = + rate_limit ? ((rate_excess + rate_limit / 2) / rate_limit) : INT_MAX; + return VPXMIN(qstep, MAX_QSTEP_ADJ); +} + static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int bottom_index, top_index; @@ -3225,9 +3698,17 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, int frame_over_shoot_limit; int frame_under_shoot_limit; int q = 0, q_low = 0, q_high = 0; + int enable_acl; +#ifdef AGGRESSIVE_VBR + int qrange_adj = 1; +#endif set_size_independent_vars(cpi); + enable_acl = cpi->sf.allow_acl + ? (cm->frame_type == KEY_FRAME) || (cm->show_frame == 0) + : 0; + do { vpx_clear_system_state(); @@ -3236,6 +3717,16 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (loop_count == 0 || cpi->resize_pending != 0) { set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); +#ifdef AGGRESSIVE_VBR + if (two_pass_first_group_inter(cpi)) { + // Adjustment limits for min and max q + qrange_adj = VPXMAX(1, (top_index - bottom_index) / 2); + + bottom_index = + VPXMAX(bottom_index - qrange_adj / 2, oxcf->best_allowed_q); + top_index = VPXMIN(oxcf->worst_allowed_q, top_index + qrange_adj / 2); + } +#endif // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. set_mv_search_params(cpi); @@ -3259,8 +3750,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, &frame_over_shoot_limit); } - cpi->Source = vp9_scale_if_required( - cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0)); + cpi->Source = + vp9_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source, + (oxcf->pass == 0), EIGHTTAP, 0); // Unfiltered raw source used in metrics calculation if the source // has been filtered. 
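// Illustrative arithmetic for the get_qstep_adj() helper added above, using
// round example numbers: the rate miss is divided by the rate limit with
// rounding to nearest, then clamped to MAX_QSTEP_ADJ (4).
//
//   get_qstep_adj(/*rate_excess=*/30000, /*rate_limit=*/10000)
//       -> (30000 + 5000) / 10000 = 3
//   get_qstep_adj(50000, 10000) -> 5, clamped to 4
//   get_qstep_adj(30000, 0)     -> INT_MAX, clamped to 4
//
// In the recode loop this replaces the old fixed +/-1 step: on overshoot
// q_low moves up by qstep (q_low = VPXMIN(q + qstep, q_high)), and on
// undershoot q_high moves down by qstep, so large rate misses converge in
// fewer recode iterations.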
@@ -3269,7 +3761,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (is_spatial_denoise_enabled(cpi)) { cpi->raw_source_frame = vp9_scale_if_required( cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source, - (cpi->oxcf.pass == 0)); + (oxcf->pass == 0), EIGHTTAP, 0); } else { cpi->raw_source_frame = cpi->Source; } @@ -3281,7 +3773,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (cpi->unscaled_last_source != NULL) cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source, &cpi->scaled_last_source, - (cpi->oxcf.pass == 0)); + (oxcf->pass == 0), EIGHTTAP, 0); if (frame_is_intra_only(cm) == 0) { if (loop_count > 0) { @@ -3296,13 +3788,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. - if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + if (oxcf->aq_mode == VARIANCE_AQ) { vp9_vaq_frame_setup(cpi); - } else if (cpi->oxcf.aq_mode == EQUATOR360_AQ) { + } else if (oxcf->aq_mode == EQUATOR360_AQ) { vp9_360aq_frame_setup(cpi); - } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + } else if (oxcf->aq_mode == COMPLEXITY_AQ) { vp9_setup_in_frame_q_adj(cpi); - } else if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) { + } else if (oxcf->aq_mode == LOOKAHEAD_AQ) { vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); } @@ -3322,12 +3814,11 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (!cpi->sf.use_nonrd_pick_mode) vp9_pack_bitstream(cpi, dest, size); rc->projected_frame_size = (int)(*size) << 3; - restore_coding_context(cpi); if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; } - if (cpi->oxcf.rc_mode == VPX_Q) { + if (oxcf->rc_mode == VPX_Q) { loop = 0; } else { if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced && @@ -3385,6 +3876,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, // to attempt to recode. int last_q = q; int retries = 0; + int qstep; if (cpi->resize_pending == 1) { // Change in frame size so go back around the recode loop. @@ -3406,11 +3898,24 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, // Frame is too large if (rc->projected_frame_size > rc->this_frame_target) { // Special case if the projected size is > the max allowed. - if (rc->projected_frame_size >= rc->max_frame_bandwidth) - q_high = rc->worst_quality; + if ((q == q_high) && + ((rc->projected_frame_size >= rc->max_frame_bandwidth) || + (rc->projected_frame_size >= + big_rate_miss_high_threshold(cpi)))) { + int max_rate = VPXMAX(1, VPXMIN(rc->max_frame_bandwidth, + big_rate_miss_high_threshold(cpi))); + double q_val_high; + q_val_high = vp9_convert_qindex_to_q(q_high, cm->bit_depth); + q_val_high = + q_val_high * ((double)rc->projected_frame_size / max_rate); + q_high = vp9_convert_q_to_qindex(q_val_high, cm->bit_depth); + q_high = clamp(q_high, rc->best_quality, rc->worst_quality); + } // Raise Qlow as to at least the current value - q_low = q < q_high ? q + 1 : q_high; + qstep = + get_qstep_adj(rc->projected_frame_size, rc->this_frame_target); + q_low = VPXMIN(q + qstep, q_high); if (undershoot_seen || loop_at_this_size > 1) { // Update rate_correction_factor unless @@ -3435,31 +3940,32 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, overshoot_seen = 1; } else { // Frame is too small - q_high = q > q_low ? 
q - 1 : q_low; + qstep = + get_qstep_adj(rc->this_frame_target, rc->projected_frame_size); + q_high = VPXMAX(q - qstep, q_low); if (overshoot_seen || loop_at_this_size > 1) { vp9_rc_update_rate_correction_factors(cpi); q = (q_high + q_low) / 2; } else { vp9_rc_update_rate_correction_factors(cpi); - q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, - top_index); + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + VPXMIN(q_low, bottom_index), top_index); // Special case reset for qlow for constrained quality. // This should only trigger where there is very substantial // undershoot on a frame and the auto cq level is above // the user passsed in value. - if (cpi->oxcf.rc_mode == VPX_CQ && q < q_low) { + if (oxcf->rc_mode == VPX_CQ && q < q_low) { q_low = q; } while (q > q_high && retries < 10) { vp9_rc_update_rate_correction_factors(cpi); - q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, - top_index); + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + VPXMIN(q_low, bottom_index), top_index); retries++; } } - undershoot_seen = 1; } @@ -3485,7 +3991,50 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, ++cpi->tot_recode_hits; #endif } + + if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) + if (loop || !enable_acl) restore_coding_context(cpi); } while (loop); + +#ifdef AGGRESSIVE_VBR + if (two_pass_first_group_inter(cpi)) { + cpi->twopass.active_worst_quality = + VPXMIN(q + qrange_adj, oxcf->worst_allowed_q); + } else if (!frame_is_kf_gf_arf(cpi)) { +#else + if (!frame_is_kf_gf_arf(cpi)) { +#endif + // Have we been forced to adapt Q outside the expected range by an extreme + // rate miss. If so adjust the active maxQ for the subsequent frames. + if (q > cpi->twopass.active_worst_quality) { + cpi->twopass.active_worst_quality = q; + } else if (oxcf->vbr_corpus_complexity && q == q_low && + rc->projected_frame_size < rc->this_frame_target) { + cpi->twopass.active_worst_quality = + VPXMAX(q, cpi->twopass.active_worst_quality - 1); + } + } + + if (enable_acl) { + // Skip recoding, if model diff is below threshold + const int thresh = compute_context_model_thresh(cpi); + const int diff = compute_context_model_diff(cm); + if (diff < thresh) { + vpx_clear_system_state(); + restore_coding_context(cpi); + return; + } + + vp9_encode_frame(cpi); + vpx_clear_system_state(); + restore_coding_context(cpi); + vp9_pack_bitstream(cpi, dest, size); + + vp9_encode_frame(cpi); + vpx_clear_system_state(); + + restore_coding_context(cpi); + } } static int get_ref_frame_flags(const VP9_COMP *cpi) { @@ -3525,18 +4074,28 @@ static void set_ext_overrides(VP9_COMP *cpi) { } } -YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - YV12_BUFFER_CONFIG *scaled_temp) { +YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, + int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) { if (cm->mi_cols * MI_SIZE != unscaled->y_width || cm->mi_rows * MI_SIZE != unscaled->y_height) { #if CONFIG_VP9_HIGHBITDEPTH - scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth); - scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth); + if (cm->bit_depth == VPX_BITS_8) { + vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, + phase_scaler2); + vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, + phase_scaler); + } else { + 
scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth, + filter_type2, phase_scaler2); + scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth, + filter_type, phase_scaler); + } #else - vp9_scale_and_extend_frame(unscaled, scaled_temp); - vp9_scale_and_extend_frame(scaled_temp, scaled); + vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, + phase_scaler2); + vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler); #endif // CONFIG_VP9_HIGHBITDEPTH return scaled; } else { @@ -3544,22 +4103,25 @@ YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm, } } -YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - int use_normative_scaler) { +YV12_BUFFER_CONFIG *vp9_scale_if_required( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler) { if (cm->mi_cols * MI_SIZE != unscaled->y_width || cm->mi_rows * MI_SIZE != unscaled->y_height) { #if CONFIG_VP9_HIGHBITDEPTH if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) && unscaled->y_height <= (scaled->y_height << 1)) - scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth); + if (cm->bit_depth == VPX_BITS_8) + vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler); + else + scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth, + filter_type, phase_scaler); else scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth); #else if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) && unscaled->y_height <= (scaled->y_height << 1)) - vp9_scale_and_extend_frame(unscaled, scaled); + vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler); else scale_and_extend_frame_nonnormative(unscaled, scaled); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -3892,12 +4454,14 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, ++cm->current_video_frame; cpi->ext_refresh_frame_flags_pending = 0; cpi->svc.rc_drop_superframe = 1; + cpi->last_frame_dropped = 1; // TODO(marpan): Advancing the svc counters on dropped frames can break // the referencing scheme for the fixed svc patterns defined in // vp9_one_pass_cbr_svc_start_layer(). Look into fixing this issue, but // for now, don't advance the svc frame counters on dropped frame. 
// if (cpi->use_svc) // vp9_inc_frame_in_layer(cpi); + return; } } @@ -3915,21 +4479,23 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, encode_with_recode_loop(cpi, size, dest); } + cpi->last_frame_dropped = 0; + // Disable segmentation if it decrease rate/distortion ratio if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) vp9_try_disable_lookahead_aq(cpi, size, dest); #if CONFIG_VP9_TEMPORAL_DENOISING #ifdef OUTPUT_YUV_DENOISED - if (oxcf->noise_sensitivity > 0) { - vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME], - yuv_denoised_file); + if (oxcf->noise_sensitivity > 0 && denoise_svc(cpi)) { + vpx_write_yuv_frame(yuv_denoised_file, + &cpi->denoiser.running_avg_y[INTRA_FRAME]); } #endif #endif #ifdef OUTPUT_YUV_SKINMAP if (cpi->common.current_video_frame > 1) { - vp9_compute_skin_map(cpi, yuv_skinmap_file); + vp9_output_skin_map(cpi, yuv_skinmap_file); } #endif @@ -4066,6 +4632,7 @@ static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, encode_frame_to_data_rate(cpi, size, dest, frame_flags); } +#if !CONFIG_REALTIME_ONLY static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; @@ -4074,6 +4641,7 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING)) vp9_twopass_postencode_update(cpi); } +#endif // !CONFIG_REALTIME_ONLY static void init_ref_frame_bufs(VP9_COMMON *cm) { int i; @@ -4268,6 +4836,26 @@ static void adjust_image_stat(double y, double u, double v, double all, } #endif // CONFIG_INTERNAL_STATS +// Adjust the maximum allowable frame size for the target level. +static void level_rc_framerate(VP9_COMP *cpi, int arf_src_index) { + RATE_CONTROL *const rc = &cpi->rc; + LevelConstraint *const ls = &cpi->level_constraint; + VP9_COMMON *const cm = &cpi->common; + const double max_cpb_size = ls->max_cpb_size; + vpx_clear_system_state(); + rc->max_frame_bandwidth = VPXMIN(rc->max_frame_bandwidth, ls->max_frame_size); + if (frame_is_intra_only(cm)) { + rc->max_frame_bandwidth = + VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.5)); + } else if (arf_src_index > 0) { + rc->max_frame_bandwidth = + VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.4)); + } else { + rc->max_frame_bandwidth = + VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.2)); + } +} + static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { VP9_COMMON *const cm = &cpi->common; Vp9LevelInfo *const level_info = &cpi->level_info; @@ -4276,6 +4864,9 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { int i, idx; uint64_t luma_samples, dur_end; const uint32_t luma_pic_size = cm->width * cm->height; + const uint32_t luma_pic_breadth = VPXMAX(cm->width, cm->height); + LevelConstraint *const level_constraint = &cpi->level_constraint; + const int8_t level_index = level_constraint->level_index; double cpb_data_size; vpx_clear_system_state(); @@ -4377,6 +4968,11 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { level_spec->max_luma_picture_size = luma_pic_size; } + // update max_luma_picture_breadth + if (luma_pic_breadth > level_spec->max_luma_picture_breadth) { + level_spec->max_luma_picture_breadth = luma_pic_breadth; + } + // update compression_ratio level_spec->compression_ratio = (double)level_stats->total_uncompressed_size * cm->bit_depth / @@ -4386,6 +4982,87 @@ static void update_level_info(VP9_COMP 
*cpi, size_t *size, int arf_src_index) { if (level_spec->max_col_tiles < (1 << cm->log2_tile_cols)) { level_spec->max_col_tiles = (1 << cm->log2_tile_cols); } + + if (level_index >= 0 && level_constraint->fail_flag == 0) { + if (level_spec->max_luma_picture_size > + vp9_level_defs[level_index].max_luma_picture_size) { + level_constraint->fail_flag |= (1 << LUMA_PIC_SIZE_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[LUMA_PIC_SIZE_TOO_LARGE]); + } + + if (level_spec->max_luma_picture_breadth > + vp9_level_defs[level_index].max_luma_picture_breadth) { + level_constraint->fail_flag |= (1 << LUMA_PIC_BREADTH_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[LUMA_PIC_BREADTH_TOO_LARGE]); + } + + if ((double)level_spec->max_luma_sample_rate > + (double)vp9_level_defs[level_index].max_luma_sample_rate * + (1 + SAMPLE_RATE_GRACE_P)) { + level_constraint->fail_flag |= (1 << LUMA_SAMPLE_RATE_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[LUMA_SAMPLE_RATE_TOO_LARGE]); + } + + if (level_spec->max_col_tiles > vp9_level_defs[level_index].max_col_tiles) { + level_constraint->fail_flag |= (1 << TOO_MANY_COLUMN_TILE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[TOO_MANY_COLUMN_TILE]); + } + + if (level_spec->min_altref_distance < + vp9_level_defs[level_index].min_altref_distance) { + level_constraint->fail_flag |= (1 << ALTREF_DIST_TOO_SMALL); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[ALTREF_DIST_TOO_SMALL]); + } + + if (level_spec->max_ref_frame_buffers > + vp9_level_defs[level_index].max_ref_frame_buffers) { + level_constraint->fail_flag |= (1 << TOO_MANY_REF_BUFFER); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[TOO_MANY_REF_BUFFER]); + } + + if (level_spec->max_cpb_size > vp9_level_defs[level_index].max_cpb_size) { + level_constraint->fail_flag |= (1 << CPB_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[CPB_TOO_LARGE]); + } + + // Set an upper bound for the next frame size. It will be used in + // level_rc_framerate() before encoding the next frame. 
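/* Worked example with hypothetical numbers (not part of the patch): if the
 * target level allows max_cpb_size = 4000 kilobits and the most recent
 * CPB_WINDOW_SIZE - 1 coded frames total 250000 bytes, then
 *   cpb_data_size  = 250000 / 125.0        = 2000 kilobits,
 *   max_frame_size = (4000 - 2000) * 1000  = 2,000,000 bits,
 * which level_rc_framerate() then applies as a cap on
 * rc->max_frame_bandwidth. While the window is still filling, the cap is
 * additionally halved. */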
+ cpb_data_size = 0; + for (i = 0; i < CPB_WINDOW_SIZE - 1; ++i) { + if (i >= level_stats->frame_window_buffer.len) break; + idx = (level_stats->frame_window_buffer.start + + level_stats->frame_window_buffer.len - 1 - i) % + FRAME_WINDOW_SIZE; + cpb_data_size += level_stats->frame_window_buffer.buf[idx].size; + } + cpb_data_size = cpb_data_size / 125.0; + level_constraint->max_frame_size = + (int)((vp9_level_defs[level_index].max_cpb_size - cpb_data_size) * + 1000.0); + if (level_stats->frame_window_buffer.len < CPB_WINDOW_SIZE - 1) + level_constraint->max_frame_size >>= 1; + } } int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, @@ -4474,8 +5151,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1; #endif - - if ((oxcf->arnr_max_frames > 0) && (oxcf->arnr_strength > 0)) { +#if !CONFIG_REALTIME_ONLY + if ((oxcf->mode != REALTIME) && (oxcf->arnr_max_frames > 0) && + (oxcf->arnr_strength > 0)) { int bitrate = cpi->rc.avg_frame_bandwidth / 40; int not_low_bitrate = bitrate > ALT_REF_AQ_LOW_BITRATE_BOUNDARY; @@ -4493,7 +5171,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, force_src_buffer = &cpi->alt_ref_buffer; } - +#endif cm->show_frame = 0; cm->intra_only = 0; cpi->refresh_alt_ref_frame = 1; @@ -4524,8 +5202,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cm->intra_only = 0; // if the flags indicate intra frame, but if the current picture is for // non-zero spatial layer, it should not be an intra picture. - // TODO(Won Kap): this needs to change if per-layer intra frame is - // allowed. if ((source->flags & VPX_EFLAG_FORCE_KF) && cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) { source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF); @@ -4554,10 +5230,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } else { *size = 0; +#if !CONFIG_REALTIME_ONLY if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) { vp9_end_first_pass(cpi); /* get last stats packet */ cpi->twopass.first_pass_done = 1; } +#endif // !CONFIG_REALTIME_ONLY return -1; } @@ -4604,6 +5282,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->frame_flags = *frame_flags; +#if !CONFIG_REALTIME_ONLY if ((oxcf->pass == 2) && (!cpi->use_svc || (is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state != ENCODING))) { @@ -4611,25 +5290,39 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } else if (oxcf->pass == 1) { set_frame_size(cpi); } +#endif // !CONFIG_REALTIME_ONLY + + if (oxcf->pass != 1 && cpi->level_constraint.level_index >= 0 && + cpi->level_constraint.fail_flag == 0) + level_rc_framerate(cpi, arf_src_index); if (cpi->oxcf.pass != 0 || cpi->use_svc || frame_is_intra_only(cm) == 1) { for (i = 0; i < MAX_REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX; } + cpi->td.mb.fp_src_pred = 0; +#if CONFIG_REALTIME_ONLY + if (cpi->use_svc) { + SvcEncode(cpi, size, dest, frame_flags); + } else { + // One pass encode + Pass0Encode(cpi, size, dest, frame_flags); + } +#else // !CONFIG_REALTIME_ONLY if (oxcf->pass == 1 && (!cpi->use_svc || is_two_pass_svc(cpi))) { const int lossless = is_lossless_requested(oxcf); #if CONFIG_VP9_HIGHBITDEPTH if (cpi->oxcf.use_highbitdepth) - cpi->td.mb.fwd_txm4x4 = + cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4; else - cpi->td.mb.fwd_txm4x4 = lossless ? 
vp9_fwht4x4 : vpx_fdct4x4; - cpi->td.mb.highbd_itxm_add = + cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; + cpi->td.mb.highbd_inv_txfm_add = lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add; #else - cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; + cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH - cpi->td.mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + cpi->td.mb.inv_txfm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; vp9_first_pass(cpi, source); } else if (oxcf->pass == 2 && (!cpi->use_svc || is_two_pass_svc(cpi))) { Pass2Encode(cpi, size, dest, frame_flags); @@ -4639,6 +5332,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // One pass encode Pass0Encode(cpi, size, dest, frame_flags); } +#endif // CONFIG_REALTIME_ONLY if (cm->refresh_frame_context) cm->frame_contexts[cm->frame_context_idx] = *cm->fc; @@ -4985,3 +5679,28 @@ void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags) { vp9_update_entropy(cpi, 0); } } + +void vp9_set_row_mt(VP9_COMP *cpi) { + // Enable row based multi-threading for supported modes of encoding + cpi->row_mt = 0; + if (((cpi->oxcf.mode == GOOD || cpi->oxcf.mode == BEST) && + cpi->oxcf.speed < 5 && cpi->oxcf.pass == 1) && + cpi->oxcf.row_mt && !cpi->use_svc) + cpi->row_mt = 1; + + if (cpi->oxcf.mode == GOOD && cpi->oxcf.speed < 5 && + (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) && cpi->oxcf.row_mt && + !cpi->use_svc) + cpi->row_mt = 1; + + // In realtime mode, enable row based multi-threading for all the speed levels + // where non-rd path is used. + if (cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5 && cpi->oxcf.row_mt) { + cpi->row_mt = 1; + } + + if (cpi->row_mt) + cpi->row_mt_bit_exact = 1; + else + cpi->row_mt_bit_exact = 0; +} diff --git a/libs/libvpx/vp9/encoder/vp9_encoder.h b/libs/libvpx/vp9/encoder/vp9_encoder.h index 66e41492b5..d723d93cbc 100644 --- a/libs/libvpx/vp9/encoder/vp9_encoder.h +++ b/libs/libvpx/vp9/encoder/vp9_encoder.h @@ -33,7 +33,9 @@ #include "vp9/encoder/vp9_aq_cyclicrefresh.h" #include "vp9/encoder/vp9_context_tree.h" #include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_job_queue.h" #include "vp9/encoder/vp9_lookahead.h" #include "vp9/encoder/vp9_mbgraph.h" #include "vp9/encoder/vp9_mcomp.h" @@ -129,6 +131,16 @@ typedef enum { RESIZE_DYNAMIC = 2 // Coded size of each frame is determined by the codec. } RESIZE_TYPE; +typedef enum { + kInvalid = 0, + kLowSadLowSumdiff = 1, + kLowSadHighSumdiff = 2, + kHighSadLowSumdiff = 3, + kHighSadHighSumdiff = 4, + kLowVarHighSumdiff = 5, + kVeryHighSad = 6, +} CONTENT_STATE_SB; + typedef struct VP9EncoderConfig { BITSTREAM_PROFILE profile; vpx_bit_depth_t bit_depth; // Codec bit-depth. 
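The row based multi-threading wired up in vp9_set_row_mt() above is only honoured for the mode/speed/pass combinations listed there, so an application has to request it and may still not get it. A minimal sketch of such a request through the public encoder API follows (standard libvpx calls; the resolution and thread count are placeholders):

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Sketch: ask for row-mt plus several worker threads for a VP9 encode.
 * vp9_set_row_mt() may still leave cpi->row_mt at 0 if the configured
 * mode/speed/pass combination does not support it. */
static int open_vp9_encoder_with_row_mt(vpx_codec_ctx_t *codec) {
  vpx_codec_enc_cfg_t cfg;
  if (vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0)) return -1;
  cfg.g_w = 1280;
  cfg.g_h = 720;
  cfg.g_threads = 4; /* with row-mt this may exceed the number of tile columns */
  if (vpx_codec_enc_init(codec, vpx_codec_vp9_cx(), &cfg, 0)) return -1;
  return vpx_codec_control(codec, VP9E_SET_ROW_MT, 1) ? -1 : 0;
}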
@@ -197,6 +209,7 @@ typedef struct VP9EncoderConfig { int two_pass_vbrbias; // two pass datarate control tweaks int two_pass_vbrmin_section; int two_pass_vbrmax_section; + int vbr_corpus_complexity; // 0 indicates corpus vbr disabled // END DATARATE CONTROL OPTIONS // ---------------------------------------------------------------- @@ -237,7 +250,7 @@ typedef struct VP9EncoderConfig { int max_threads; - int target_level; + unsigned int target_level; vpx_fixed_buf_t two_pass_stats_in; struct vpx_codec_pkt_list *output_pkt_list; @@ -256,6 +269,9 @@ typedef struct VP9EncoderConfig { int render_width; int render_height; VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode; + + int row_mt; + unsigned int motion_vector_unit_test; } VP9EncoderConfig; static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { @@ -267,14 +283,47 @@ typedef struct TileDataEnc { TileInfo tile_info; int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; int mode_map[BLOCK_SIZES][MAX_MODES]; + FIRSTPASS_DATA fp_data; + VP9RowMTSync row_mt_sync; + + // Used for adaptive_rd_thresh with row multithreading + int *row_base_thresh_freq_fact; } TileDataEnc; +typedef struct RowMTInfo { + JobQueueHandle job_queue_hdl; +#if CONFIG_MULTITHREAD + pthread_mutex_t job_mutex; +#endif +} RowMTInfo; + +typedef struct { + TOKENEXTRA *start; + TOKENEXTRA *stop; + unsigned int count; +} TOKENLIST; + +typedef struct MultiThreadHandle { + int allocated_tile_rows; + int allocated_tile_cols; + int allocated_vert_unit_rows; + + // Frame level params + int num_tile_vert_sbs[MAX_NUM_TILE_ROWS]; + + // Job Queue structure and handles + JobQueue *job_queue; + + int jobs_per_tile_col; + + RowMTInfo row_mt_info[MAX_NUM_TILE_COLS]; + int thread_id_to_tile_id[MAX_NUM_THREADS]; // Mapping of threads to tiles +} MultiThreadHandle; + typedef struct RD_COUNTS { vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES]; int64_t comp_pred_diff[REFERENCE_MODES]; int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS]; - int m_search_count; - int ex_search_count; } RD_COUNTS; typedef struct ThreadData { @@ -312,6 +361,7 @@ typedef struct IMAGE_STAT { typedef enum { LEVEL_UNKNOWN = 0, + LEVEL_AUTO = 1, LEVEL_1 = 10, LEVEL_1_1 = 11, LEVEL_2 = 20, @@ -333,6 +383,7 @@ typedef struct { VP9_LEVEL level; uint64_t max_luma_sample_rate; uint32_t max_luma_picture_size; + uint32_t max_luma_picture_breadth; double average_bitrate; // in kilobits per second double max_cpb_size; // in kilobits double compression_ratio; @@ -341,6 +392,8 @@ typedef struct { uint8_t max_ref_frame_buffers; } Vp9LevelSpec; +extern const Vp9LevelSpec vp9_level_defs[VP9_LEVELS]; + typedef struct { int64_t ts; // timestamp uint32_t luma_samples; @@ -368,6 +421,35 @@ typedef struct { Vp9LevelSpec level_spec; } Vp9LevelInfo; +typedef enum { + BITRATE_TOO_LARGE = 0, + LUMA_PIC_SIZE_TOO_LARGE, + LUMA_PIC_BREADTH_TOO_LARGE, + LUMA_SAMPLE_RATE_TOO_LARGE, + CPB_TOO_LARGE, + COMPRESSION_RATIO_TOO_SMALL, + TOO_MANY_COLUMN_TILE, + ALTREF_DIST_TOO_SMALL, + TOO_MANY_REF_BUFFER, + TARGET_LEVEL_FAIL_IDS +} TARGET_LEVEL_FAIL_ID; + +typedef struct { + int8_t level_index; + uint8_t rc_config_updated; + uint8_t fail_flag; + int max_frame_size; // in bits + double max_cpb_size; // in bits +} LevelConstraint; + +typedef struct ARNRFilterData { + YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS]; + int strength; + int frame_count; + int alt_ref_index; + struct scale_factors sf; +} ARNRFilterData; + typedef struct VP9_COMP { QUANTS quants; ThreadData td; @@ -418,6 +500,7 @@ typedef struct VP9_COMP { TOKENEXTRA *tile_tok[4][1 << 6]; 
uint32_t tok_count[4][1 << 6]; + TOKENLIST *tplist[4][1 << 6]; // Ambient reconstruction err target for force key frames int64_t ambient_err; @@ -463,6 +546,8 @@ typedef struct VP9_COMP { uint8_t *segmentation_map; + uint8_t *skin_map; + // segment threashold for encode breakout int segment_encode_breakout[MAX_SEGMENTS]; @@ -470,7 +555,6 @@ typedef struct VP9_COMP { ActiveMap active_map; fractional_mv_step_fp *find_fractional_mv_step; - vp9_full_search_fn_t full_search_sad; vp9_diamond_search_fn_t diamond_search_sad; vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES]; uint64_t time_receive_data; @@ -571,7 +655,7 @@ typedef struct VP9_COMP { #endif int resize_pending; - int resize_state; + RESIZE_STATE resize_state; int external_resize; int resize_scale_num; int resize_scale_den; @@ -594,6 +678,8 @@ typedef struct VP9_COMP { int64_t vbp_thresholds[4]; int64_t vbp_threshold_minmax; int64_t vbp_threshold_sad; + // Threshold used for partition copy + int64_t vbp_threshold_copy; BLOCK_SIZE vbp_bsize_min; // Multi-threading @@ -601,9 +687,42 @@ typedef struct VP9_COMP { VPxWorker *workers; struct EncWorkerData *tile_thr_data; VP9LfSync lf_row_sync; + struct VP9BitstreamWorkerData *vp9_bitstream_worker_data; int keep_level_stats; Vp9LevelInfo level_info; + MultiThreadHandle multi_thread_ctxt; + void (*row_mt_sync_read_ptr)(VP9RowMTSync *const, int, int); + void (*row_mt_sync_write_ptr)(VP9RowMTSync *const, int, int, const int); + ARNRFilterData arnr_filter_data; + + int row_mt; + unsigned int row_mt_bit_exact; + + // Previous Partition Info + BLOCK_SIZE *prev_partition; + int8_t *prev_segment_id; + // Used to save the status of whether a block has a low variance in + // choose_partitioning. 0 for 64x64, 1~2 for 64x32, 3~4 for 32x64, 5~8 for + // 32x32, 9~24 for 16x16. + // This is for the last frame and is copied to the current frame + // when partition copy happens. + uint8_t *prev_variance_low; + uint8_t *copied_frame_cnt; + uint8_t max_copied_frame; + // If the last frame is dropped, we don't copy partition. + uint8_t last_frame_dropped; + + // For each superblock: keeps track of the last time (in frame distance) the + // the superblock did not have low source sad. 
+ uint8_t *content_state_sb_fd; + + int compute_source_sad_onepass; + + LevelConstraint level_constraint; + + uint8_t *count_arf_frame_usage; + uint8_t *count_lastgolden_frame_usage; } VP9_COMP; void vp9_initialize_enc(void); @@ -702,6 +821,20 @@ static INLINE int allocated_tokens(TileInfo tile) { return get_token_alloc(tile_mb_rows, tile_mb_cols); } +static INLINE void get_start_tok(VP9_COMP *cpi, int tile_row, int tile_col, + int mi_row, TOKENEXTRA **tok) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + + int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 1) >> 1; + const int mb_row = (mi_row - tile_info->mi_row_start) >> 1; + + *tok = + cpi->tile_tok[tile_row][tile_col] + get_token_alloc(mb_row, tile_mb_cols); +} + int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); #if CONFIG_VP9_HIGHBITDEPTH int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, @@ -714,15 +847,14 @@ void vp9_update_reference_frames(VP9_COMP *cpi); void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv); -YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - YV12_BUFFER_CONFIG *scaled_temp); +YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, + int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2); -YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - int use_normative_scaler); +YV12_BUFFER_CONFIG *vp9_scale_if_required( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler); void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags); @@ -734,8 +866,18 @@ static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) { return (cpi->use_svc && cpi->oxcf.pass == 0); } +#if CONFIG_VP9_TEMPORAL_DENOISING +static INLINE int denoise_svc(const struct VP9_COMP *const cpi) { + return (!cpi->use_svc || + (cpi->use_svc && + cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise)); +} +#endif + +#define MIN_LOOKAHEAD_FOR_ARFS 4 static INLINE int is_altref_enabled(const VP9_COMP *const cpi) { - return cpi->oxcf.mode != REALTIME && cpi->oxcf.lag_in_frames > 0 && + return !(cpi->oxcf.mode == REALTIME && cpi->oxcf.rc_mode == VPX_CBR) && + cpi->oxcf.lag_in_frames >= MIN_LOOKAHEAD_FOR_ARFS && (cpi->oxcf.enable_auto_arf && (!is_two_pass_svc(cpi) || cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id])); @@ -758,10 +900,48 @@ static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) { return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? 
cost_list : NULL; } +static INLINE int get_num_vert_units(TileInfo tile, int shift) { + int num_vert_units = + (tile.mi_row_end - tile.mi_row_start + (1 << shift) - 1) >> shift; + return num_vert_units; +} + +static INLINE int get_num_cols(TileInfo tile, int shift) { + int num_cols = + (tile.mi_col_end - tile.mi_col_start + (1 << shift) - 1) >> shift; + return num_cols; +} + +static INLINE int get_level_index(VP9_LEVEL level) { + int i; + for (i = 0; i < VP9_LEVELS; ++i) { + if (level == vp9_level_defs[i].level) return i; + } + return -1; +} + +// Return the log2 value of max column tiles corresponding to the level that +// the picture size fits into. +static INLINE int log_tile_cols_from_picsize_level(uint32_t width, + uint32_t height) { + int i; + const uint32_t pic_size = width * height; + const uint32_t pic_breadth = VPXMAX(width, height); + for (i = LEVEL_1; i < LEVEL_MAX; ++i) { + if (vp9_level_defs[i].max_luma_picture_size >= pic_size && + vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) { + return get_msb(vp9_level_defs[i].max_col_tiles); + } + } + return INT_MAX; +} + VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); void vp9_new_framerate(VP9_COMP *cpi, double framerate); +void vp9_set_row_mt(VP9_COMP *cpi); + #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) #ifdef __cplusplus diff --git a/libs/libvpx/vp9/encoder/vp9_ethread.c b/libs/libvpx/vp9/encoder/vp9_ethread.c index 7657573bbf..0bd2e21451 100644 --- a/libs/libvpx/vp9/encoder/vp9_ethread.c +++ b/libs/libvpx/vp9/encoder/vp9_ethread.c @@ -11,6 +11,9 @@ #include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_multi_thread.h" +#include "vp9/encoder/vp9_temporal_filter.h" #include "vpx_dsp/vpx_dsp_common.h" static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { @@ -30,13 +33,10 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { for (n = 0; n < ENTROPY_TOKENS; n++) td->rd_counts.coef_counts[i][j][k][l][m][n] += td_t->rd_counts.coef_counts[i][j][k][l][m][n]; - - // Counts of all motion searches and exhuastive mesh searches. 
- td->rd_counts.m_search_count += td_t->rd_counts.m_search_count; - td->rd_counts.ex_search_count += td_t->rd_counts.ex_search_count; } -static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) { +static int enc_worker_hook(void *arg1, void *unused) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; VP9_COMP *const cpi = thread_data->cpi; const VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; @@ -65,25 +65,29 @@ static int get_max_tile_cols(VP9_COMP *cpi) { vp9_get_tile_n_bits(mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); log2_tile_cols = clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); + if (cpi->oxcf.target_level == LEVEL_AUTO) { + const int level_tile_cols = + log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height); + if (log2_tile_cols > level_tile_cols) { + log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols); + } + } return (1 << log2_tile_cols); } -void vp9_encode_tiles_mt(VP9_COMP *cpi) { +static void create_enc_workers(VP9_COMP *cpi, int num_workers) { VP9_COMMON *const cm = &cpi->common; - const int tile_cols = 1 << cm->log2_tile_cols; const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols); int i; - vp9_init_tile_data(cpi); - // Only run once to create threads and allocate thread data. if (cpi->num_workers == 0) { int allocated_workers = num_workers; // While using SVC, we need to allocate threads according to the highest - // resolution. - if (cpi->use_svc) { + // resolution. When row based multithreading is enabled, it is OK to + // allocate more threads than the number of max tile columns. + if (cpi->use_svc && !cpi->row_mt) { int max_tile_cols = get_max_tile_cols(cpi); allocated_workers = VPXMIN(cpi->oxcf.max_threads, max_tile_cols); } @@ -127,19 +131,57 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { thread_data->cpi = cpi; thread_data->td = &cpi->td; } - winterface->sync(worker); } } +} + +static void launch_enc_workers(VP9_COMP *cpi, VPxWorkerHook hook, void *data2, + int num_workers) { + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + int i; for (i = 0; i < num_workers; i++) { VPxWorker *const worker = &cpi->workers[i]; - EncWorkerData *thread_data; - - worker->hook = (VPxWorkerHook)enc_worker_hook; + worker->hook = hook; worker->data1 = &cpi->tile_thr_data[i]; - worker->data2 = NULL; - thread_data = (EncWorkerData *)worker->data1; + worker->data2 = data2; + } + + // Encode a frame + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + + // Set the starting tile for each thread. + thread_data->start = i; + + if (i == cpi->num_workers - 1) + winterface->execute(worker); + else + winterface->launch(worker); + } + + // Encoding ends. + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + winterface->sync(worker); + } +} + +void vp9_encode_tiles_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols); + int i; + + vp9_init_tile_data(cpi); + + create_enc_workers(cpi, num_workers); + + for (i = 0; i < num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; // Before encoding a frame, copy the thread data from cpi. 
if (thread_data->td != &cpi->td) { @@ -169,25 +211,445 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { } } - // Encode a frame - for (i = 0; i < num_workers; i++) { - VPxWorker *const worker = &cpi->workers[i]; - EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; - - // Set the starting tile for each thread. - thread_data->start = i; - - if (i == cpi->num_workers - 1) - winterface->execute(worker); - else - winterface->launch(worker); - } - - // Encoding ends. - for (i = 0; i < num_workers; i++) { - VPxWorker *const worker = &cpi->workers[i]; - winterface->sync(worker); - } + launch_enc_workers(cpi, enc_worker_hook, NULL, num_workers); + + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + + // Accumulate counters. + if (i < cpi->num_workers - 1) { + vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0); + accumulate_rd_opt(&cpi->td, thread_data->td); + } + } +} + +#if !CONFIG_REALTIME_ONLY +static void accumulate_fp_tile_stat(TileDataEnc *tile_data, + TileDataEnc *tile_data_t) { + tile_data->fp_data.intra_factor += tile_data_t->fp_data.intra_factor; + tile_data->fp_data.brightness_factor += + tile_data_t->fp_data.brightness_factor; + tile_data->fp_data.coded_error += tile_data_t->fp_data.coded_error; + tile_data->fp_data.sr_coded_error += tile_data_t->fp_data.sr_coded_error; + tile_data->fp_data.frame_noise_energy += + tile_data_t->fp_data.frame_noise_energy; + tile_data->fp_data.intra_error += tile_data_t->fp_data.intra_error; + tile_data->fp_data.intercount += tile_data_t->fp_data.intercount; + tile_data->fp_data.second_ref_count += tile_data_t->fp_data.second_ref_count; + tile_data->fp_data.neutral_count += tile_data_t->fp_data.neutral_count; + tile_data->fp_data.intra_count_low += tile_data_t->fp_data.intra_count_low; + tile_data->fp_data.intra_count_high += tile_data_t->fp_data.intra_count_high; + tile_data->fp_data.intra_skip_count += tile_data_t->fp_data.intra_skip_count; + tile_data->fp_data.mvcount += tile_data_t->fp_data.mvcount; + tile_data->fp_data.sum_mvr += tile_data_t->fp_data.sum_mvr; + tile_data->fp_data.sum_mvr_abs += tile_data_t->fp_data.sum_mvr_abs; + tile_data->fp_data.sum_mvc += tile_data_t->fp_data.sum_mvc; + tile_data->fp_data.sum_mvc_abs += tile_data_t->fp_data.sum_mvc_abs; + tile_data->fp_data.sum_mvrs += tile_data_t->fp_data.sum_mvrs; + tile_data->fp_data.sum_mvcs += tile_data_t->fp_data.sum_mvcs; + tile_data->fp_data.sum_in_vectors += tile_data_t->fp_data.sum_in_vectors; + tile_data->fp_data.intra_smooth_count += + tile_data_t->fp_data.intra_smooth_count; + tile_data->fp_data.image_data_start_row = + VPXMIN(tile_data->fp_data.image_data_start_row, + tile_data_t->fp_data.image_data_start_row) == INVALID_ROW + ? 
VPXMAX(tile_data->fp_data.image_data_start_row, + tile_data_t->fp_data.image_data_start_row) + : VPXMIN(tile_data->fp_data.image_data_start_row, + tile_data_t->fp_data.image_data_start_row); +} +#endif // !CONFIG_REALTIME_ONLY + +// Allocate memory for row synchronization +void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, + int rows) { + row_mt_sync->rows = rows; +#if CONFIG_MULTITHREAD + { + int i; + + CHECK_MEM_ERROR(cm, row_mt_sync->mutex_, + vpx_malloc(sizeof(*row_mt_sync->mutex_) * rows)); + if (row_mt_sync->mutex_) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&row_mt_sync->mutex_[i], NULL); + } + } + + CHECK_MEM_ERROR(cm, row_mt_sync->cond_, + vpx_malloc(sizeof(*row_mt_sync->cond_) * rows)); + if (row_mt_sync->cond_) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&row_mt_sync->cond_[i], NULL); + } + } + } +#endif // CONFIG_MULTITHREAD + + CHECK_MEM_ERROR(cm, row_mt_sync->cur_col, + vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows)); + + // Set up nsync. + row_mt_sync->sync_range = 1; +} + +// Deallocate row based multi-threading synchronization related mutex and data +void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync) { + if (row_mt_sync != NULL) { +#if CONFIG_MULTITHREAD + int i; + + if (row_mt_sync->mutex_ != NULL) { + for (i = 0; i < row_mt_sync->rows; ++i) { + pthread_mutex_destroy(&row_mt_sync->mutex_[i]); + } + vpx_free(row_mt_sync->mutex_); + } + if (row_mt_sync->cond_ != NULL) { + for (i = 0; i < row_mt_sync->rows; ++i) { + pthread_cond_destroy(&row_mt_sync->cond_[i]); + } + vpx_free(row_mt_sync->cond_); + } +#endif // CONFIG_MULTITHREAD + vpx_free(row_mt_sync->cur_col); + // clear the structure as the source of this call may be dynamic change + // in tiles in which case this call will be followed by an _alloc() + // which may fail. + vp9_zero(*row_mt_sync); + } +} + +void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c) { +#if CONFIG_MULTITHREAD + const int nsync = row_mt_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1]; + pthread_mutex_lock(mutex); + + while (c > row_mt_sync->cur_col[r - 1] - nsync + 1) { + pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)row_mt_sync; + (void)r; + (void)c; +#endif // CONFIG_MULTITHREAD +} + +void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c) { + (void)row_mt_sync; + (void)r; + (void)c; + return; +} + +void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols) { +#if CONFIG_MULTITHREAD + const int nsync = row_mt_sync->sync_range; + int cur; + // Only signal when there are enough encoded blocks for next row to run. 
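/* How the read/write pair behaves with sync_range == 1 (the value set above
 * in vp9_row_mt_sync_mem_alloc): a worker on row r calls
 * vp9_row_mt_sync_read(sync, r, c) before encoding column c and blocks until
 * cur_col[r - 1] >= c, i.e. until the row above has finished the same
 * column; vp9_row_mt_sync_write(sync, r, c, cols) then publishes
 * cur_col[r] = c (or cols + nsync for the final column, which releases any
 * remaining waiters on this row). */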
+ int sig = 1; + + if (c < cols - 1) { + cur = c; + if (c % nsync != nsync - 1) sig = 0; + } else { + cur = cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&row_mt_sync->mutex_[r]); + + row_mt_sync->cur_col[r] = cur; + + pthread_cond_signal(&row_mt_sync->cond_[r]); + pthread_mutex_unlock(&row_mt_sync->mutex_[r]); + } +#else + (void)row_mt_sync; + (void)r; + (void)c; + (void)cols; +#endif // CONFIG_MULTITHREAD +} + +void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols) { + (void)row_mt_sync; + (void)r; + (void)c; + (void)cols; + return; +} + +#if !CONFIG_REALTIME_ONLY +static int first_pass_worker_hook(EncWorkerData *const thread_data, + MultiThreadHandle *multi_thread_ctxt) { + VP9_COMP *const cpi = thread_data->cpi; + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + int tile_row, tile_col; + TileDataEnc *this_tile; + int end_of_frame; + int thread_id = thread_data->thread_id; + int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id]; + JobNode *proc_job = NULL; + FIRSTPASS_DATA fp_acc_data; + MV zero_mv = { 0, 0 }; + MV best_ref_mv; + int mb_row; + + end_of_frame = 0; + while (0 == end_of_frame) { + // Get the next job in the queue + proc_job = + (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id); + if (NULL == proc_job) { + // Query for the status of other tiles + end_of_frame = vp9_get_tiles_proc_status( + multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id, + tile_cols); + } else { + tile_col = proc_job->tile_col_id; + tile_row = proc_job->tile_row_id; + + this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + mb_row = proc_job->vert_unit_row_num; + + best_ref_mv = zero_mv; + vp9_zero(fp_acc_data); + fp_acc_data.image_data_start_row = INVALID_ROW; + vp9_first_pass_encode_tile_mb_row(cpi, thread_data->td, &fp_acc_data, + this_tile, &best_ref_mv, mb_row); + } + } + return 0; +} + +void vp9_encode_fp_row_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + TileDataEnc *first_tile_col; + int num_workers = VPXMAX(cpi->oxcf.max_threads, 1); + int i; + + if (multi_thread_ctxt->allocated_tile_cols < tile_cols || + multi_thread_ctxt->allocated_tile_rows < tile_rows || + multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) { + vp9_row_mt_mem_dealloc(cpi); + vp9_init_tile_data(cpi); + vp9_row_mt_mem_alloc(cpi); + } else { + vp9_init_tile_data(cpi); + } + + create_enc_workers(cpi, num_workers); + + vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers); + + vp9_prepare_job_queue(cpi, FIRST_PASS_JOB); + + vp9_multi_thread_tile_init(cpi); + + for (i = 0; i < num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + + // Before encoding a frame, copy the thread data from cpi. 
+ if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + } + } + + launch_enc_workers(cpi, (VPxWorkerHook)first_pass_worker_hook, + multi_thread_ctxt, num_workers); + + first_tile_col = &cpi->tile_data[0]; + for (i = 1; i < tile_cols; i++) { + TileDataEnc *this_tile = &cpi->tile_data[i]; + accumulate_fp_tile_stat(first_tile_col, this_tile); + } +} + +static int temporal_filter_worker_hook(EncWorkerData *const thread_data, + MultiThreadHandle *multi_thread_ctxt) { + VP9_COMP *const cpi = thread_data->cpi; + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + int tile_row, tile_col; + int mb_col_start, mb_col_end; + TileDataEnc *this_tile; + int end_of_frame; + int thread_id = thread_data->thread_id; + int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id]; + JobNode *proc_job = NULL; + int mb_row; + + end_of_frame = 0; + while (0 == end_of_frame) { + // Get the next job in the queue + proc_job = + (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id); + if (NULL == proc_job) { + // Query for the status of other tiles + end_of_frame = vp9_get_tiles_proc_status( + multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id, + tile_cols); + } else { + tile_col = proc_job->tile_col_id; + tile_row = proc_job->tile_row_id; + this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + mb_col_start = (this_tile->tile_info.mi_col_start) >> 1; + mb_col_end = (this_tile->tile_info.mi_col_end + 1) >> 1; + mb_row = proc_job->vert_unit_row_num; + + vp9_temporal_filter_iterate_row_c(cpi, thread_data->td, mb_row, + mb_col_start, mb_col_end); + } + } + return 0; +} + +void vp9_temporal_filter_row_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int num_workers = cpi->num_workers ? cpi->num_workers : 1; + int i; + + if (multi_thread_ctxt->allocated_tile_cols < tile_cols || + multi_thread_ctxt->allocated_tile_rows < tile_rows || + multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) { + vp9_row_mt_mem_dealloc(cpi); + vp9_init_tile_data(cpi); + vp9_row_mt_mem_alloc(cpi); + } else { + vp9_init_tile_data(cpi); + } + + create_enc_workers(cpi, num_workers); + + vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers); + + vp9_prepare_job_queue(cpi, ARNR_JOB); + + for (i = 0; i < num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + + // Before encoding a frame, copy the thread data from cpi. 
+ if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + } + } + + launch_enc_workers(cpi, (VPxWorkerHook)temporal_filter_worker_hook, + multi_thread_ctxt, num_workers); +} +#endif // !CONFIG_REALTIME_ONLY + +static int enc_row_mt_worker_hook(EncWorkerData *const thread_data, + MultiThreadHandle *multi_thread_ctxt) { + VP9_COMP *const cpi = thread_data->cpi; + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + int tile_row, tile_col; + int end_of_frame; + int thread_id = thread_data->thread_id; + int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id]; + JobNode *proc_job = NULL; + int mi_row; + + end_of_frame = 0; + while (0 == end_of_frame) { + // Get the next job in the queue + proc_job = + (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id); + if (NULL == proc_job) { + // Query for the status of other tiles + end_of_frame = vp9_get_tiles_proc_status( + multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id, + tile_cols); + } else { + tile_col = proc_job->tile_col_id; + tile_row = proc_job->tile_row_id; + mi_row = proc_job->vert_unit_row_num * MI_BLOCK_SIZE; + + vp9_encode_sb_row(cpi, thread_data->td, tile_row, tile_col, mi_row); + } + } + return 0; +} + +void vp9_encode_tiles_row_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int num_workers = VPXMAX(cpi->oxcf.max_threads, 1); + int i; + + if (multi_thread_ctxt->allocated_tile_cols < tile_cols || + multi_thread_ctxt->allocated_tile_rows < tile_rows || + multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) { + vp9_row_mt_mem_dealloc(cpi); + vp9_init_tile_data(cpi); + vp9_row_mt_mem_alloc(cpi); + } else { + vp9_init_tile_data(cpi); + } + + create_enc_workers(cpi, num_workers); + + vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers); + + vp9_prepare_job_queue(cpi, ENCODE_JOB); + + vp9_multi_thread_tile_init(cpi); + + for (i = 0; i < num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + // Before encoding a frame, copy the thread data from cpi. + if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + thread_data->td->rd_counts = cpi->td.rd_counts; + } + if (thread_data->td->counts != &cpi->common.counts) { + memcpy(thread_data->td->counts, &cpi->common.counts, + sizeof(cpi->common.counts)); + } + + // Handle use_nonrd_pick_mode case. 
+ if (cpi->sf.use_nonrd_pick_mode) { + MACROBLOCK *const x = &thread_data->td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + PICK_MODE_CONTEXT *ctx = &thread_data->td->pc_root->none; + int j; + + for (j = 0; j < MAX_MB_PLANE; ++j) { + p[j].coeff = ctx->coeff_pbuf[j][0]; + p[j].qcoeff = ctx->qcoeff_pbuf[j][0]; + pd[j].dqcoeff = ctx->dqcoeff_pbuf[j][0]; + p[j].eobs = ctx->eobs_pbuf[j][0]; + } + } + } + + launch_enc_workers(cpi, (VPxWorkerHook)enc_row_mt_worker_hook, + multi_thread_ctxt, num_workers); for (i = 0; i < num_workers; i++) { VPxWorker *const worker = &cpi->workers[i]; diff --git a/libs/libvpx/vp9/encoder/vp9_ethread.h b/libs/libvpx/vp9/encoder/vp9_ethread.h index 1efa4dcde2..a396e621d7 100644 --- a/libs/libvpx/vp9/encoder/vp9_ethread.h +++ b/libs/libvpx/vp9/encoder/vp9_ethread.h @@ -15,6 +15,10 @@ extern "C" { #endif +#define MAX_NUM_TILE_COLS (1 << 6) +#define MAX_NUM_TILE_ROWS 4 +#define MAX_NUM_THREADS 80 + struct VP9_COMP; struct ThreadData; @@ -22,10 +26,45 @@ typedef struct EncWorkerData { struct VP9_COMP *cpi; struct ThreadData *td; int start; + int thread_id; + int tile_completion_status[MAX_NUM_TILE_COLS]; } EncWorkerData; +// Encoder row synchronization +typedef struct VP9RowMTSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_; + pthread_cond_t *cond_; +#endif + // Allocate memory to store the sb/mb block index in each row. + int *cur_col; + int sync_range; + int rows; +} VP9RowMTSync; + void vp9_encode_tiles_mt(struct VP9_COMP *cpi); +void vp9_encode_tiles_row_mt(struct VP9_COMP *cpi); + +void vp9_encode_fp_row_mt(struct VP9_COMP *cpi); + +void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c); +void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols); + +void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c); +void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols); + +// Allocate memory for row based multi-threading synchronization. +void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, struct VP9Common *cm, + int rows); + +// Deallocate row based multi-threading synchronization related mutex and data. 
+void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync); + +void vp9_temporal_filter_row_mt(struct VP9_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/libs/libvpx/vp9/encoder/vp9_firstpass.c b/libs/libvpx/vp9/encoder/vp9_firstpass.c index 1c716a1028..fb6b132a5b 100644 --- a/libs/libvpx/vp9/encoder/vp9_firstpass.c +++ b/libs/libvpx/vp9/encoder/vp9_firstpass.c @@ -31,6 +31,7 @@ #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mcomp.h" @@ -40,20 +41,13 @@ #define OUTPUT_FPF 0 #define ARF_STATS_OUTPUT 0 +#define COMPLEXITY_STATS_OUTPUT 0 -#define BOOST_BREAKOUT 12.5 -#define BOOST_FACTOR 12.5 -#define FACTOR_PT_LOW 0.70 -#define FACTOR_PT_HIGH 0.90 #define FIRST_PASS_Q 10.0 -#define GF_MAX_BOOST 96.0 #define INTRA_MODE_PENALTY 1024 -#define KF_MAX_BOOST 128.0 #define MIN_ARF_GF_BOOST 240 #define MIN_DECAY_FACTOR 0.01 -#define MIN_KF_BOOST 300 #define NEW_MV_MODE_PENALTY 32 -#define SVC_FACTOR_PT_LOW 0.45 #define DARK_THRESH 64 #define DEFAULT_GRP_WEIGHT 1.0 #define RC_FACTOR_MIN 0.75 @@ -109,18 +103,19 @@ static void output_stats(FIRSTPASS_STATS *stats, fpfile = fopen("firstpass.stt", "a"); fprintf(fpfile, - "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.0lf %12.4lf" + "%12.0lf %12.4lf %12.2lf %12.2lf %12.2lf %12.0lf %12.4lf %12.4lf" "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf" - "%12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf" + "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.0lf %12.0lf" + "%12.4lf" "\n", stats->frame, stats->weight, stats->intra_error, stats->coded_error, stats->sr_coded_error, stats->frame_noise_energy, stats->pcnt_inter, stats->pcnt_motion, stats->pcnt_second_ref, stats->pcnt_neutral, + stats->pcnt_intra_low, stats->pcnt_intra_high, stats->intra_skip_pct, stats->intra_smooth_pct, stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr, stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv, - stats->MVcv, stats->mv_in_out_count, stats->new_mv_count, - stats->count, stats->duration); + stats->MVcv, stats->mv_in_out_count, stats->count, stats->duration); fclose(fpfile); } #endif @@ -150,6 +145,8 @@ static void zero_stats(FIRSTPASS_STATS *section) { section->pcnt_neutral = 0.0; section->intra_skip_pct = 0.0; section->intra_smooth_pct = 0.0; + section->pcnt_intra_low = 0.0; + section->pcnt_intra_high = 0.0; section->inactive_zone_rows = 0.0; section->inactive_zone_cols = 0.0; section->MVr = 0.0; @@ -159,7 +156,6 @@ static void zero_stats(FIRSTPASS_STATS *section) { section->MVrv = 0.0; section->MVcv = 0.0; section->mv_in_out_count = 0.0; - section->new_mv_count = 0.0; section->count = 0.0; section->duration = 1.0; section->spatial_layer_id = 0; @@ -180,6 +176,8 @@ static void accumulate_stats(FIRSTPASS_STATS *section, section->pcnt_neutral += frame->pcnt_neutral; section->intra_skip_pct += frame->intra_skip_pct; section->intra_smooth_pct += frame->intra_smooth_pct; + section->pcnt_intra_low += frame->pcnt_intra_low; + section->pcnt_intra_high += frame->pcnt_intra_high; section->inactive_zone_rows += frame->inactive_zone_rows; section->inactive_zone_cols += frame->inactive_zone_cols; section->MVr += frame->MVr; @@ -189,7 +187,6 @@ static void accumulate_stats(FIRSTPASS_STATS *section, section->MVrv += frame->MVrv; section->MVcv += frame->MVcv; section->mv_in_out_count += frame->mv_in_out_count; - 
section->new_mv_count += frame->new_mv_count; section->count += frame->count; section->duration += frame->duration; } @@ -208,6 +205,8 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->pcnt_neutral -= frame->pcnt_neutral; section->intra_skip_pct -= frame->intra_skip_pct; section->intra_smooth_pct -= frame->intra_smooth_pct; + section->pcnt_intra_low -= frame->pcnt_intra_low; + section->pcnt_intra_high -= frame->pcnt_intra_high; section->inactive_zone_rows -= frame->inactive_zone_rows; section->inactive_zone_cols -= frame->inactive_zone_cols; section->MVr -= frame->MVr; @@ -217,7 +216,6 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->MVrv -= frame->MVrv; section->MVcv -= frame->MVcv; section->mv_in_out_count -= frame->mv_in_out_count; - section->new_mv_count -= frame->new_mv_count; section->count -= frame->count; section->duration -= frame->duration; } @@ -237,17 +235,26 @@ static double calculate_active_area(const VP9_COMP *cpi, return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA); } +// Get the average weighted error for the clip (or corpus) +static double get_distribution_av_err(VP9_COMP *cpi, TWO_PASS *const twopass) { + const double av_weight = + twopass->total_stats.weight / twopass->total_stats.count; + + if (cpi->oxcf.vbr_corpus_complexity) + return av_weight * twopass->mean_mod_score; + else + return (twopass->total_stats.coded_error * av_weight) / + twopass->total_stats.count; +} + +#define ACT_AREA_CORRECTION 0.5 // Calculate a modified Error used in distributing bits between easier and // harder frames. -#define ACT_AREA_CORRECTION 0.5 -static double calculate_modified_err(const VP9_COMP *cpi, - const TWO_PASS *twopass, - const VP9EncoderConfig *oxcf, - const FIRSTPASS_STATS *this_frame) { - const FIRSTPASS_STATS *const stats = &twopass->total_stats; - const double av_weight = stats->weight / stats->count; - const double av_err = (stats->coded_error * av_weight) / stats->count; - double modified_error = +static double calculate_mod_frame_score(const VP9_COMP *cpi, + const VP9EncoderConfig *oxcf, + const FIRSTPASS_STATS *this_frame, + const double av_err) { + double modified_score = av_err * pow(this_frame->coded_error * this_frame->weight / DOUBLE_DIVIDE_CHECK(av_err), oxcf->two_pass_vbrbias / 100.0); @@ -257,11 +264,37 @@ static double calculate_modified_err(const VP9_COMP *cpi, // remaining active MBs. The correction here assumes that coding // 0.5N blocks of complexity 2X is a little easier than coding N // blocks of complexity X. - modified_error *= + modified_score *= pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION); - return fclamp(modified_error, twopass->modified_error_min, - twopass->modified_error_max); + return modified_score; +} + +static double calculate_norm_frame_score(const VP9_COMP *cpi, + const TWO_PASS *twopass, + const VP9EncoderConfig *oxcf, + const FIRSTPASS_STATS *this_frame, + const double av_err) { + double modified_score = + av_err * pow(this_frame->coded_error * this_frame->weight / + DOUBLE_DIVIDE_CHECK(av_err), + oxcf->two_pass_vbrbias / 100.0); + + const double min_score = (double)(oxcf->two_pass_vbrmin_section) / 100.0; + const double max_score = (double)(oxcf->two_pass_vbrmax_section) / 100.0; + + // Correction for active area. Frames with a reduced active area + // (eg due to formatting bars) have a higher error per mb for the + // remaining active MBs. The correction here assumes that coding + // 0.5N blocks of complexity 2X is a little easier than coding N + // blocks of complexity X. 
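/* Taken together, the score used for two pass bit allocation is, before
 * clamping:
 *   score = av_err
 *           * pow(coded_error * weight / av_err, two_pass_vbrbias / 100)
 *           * pow(active_area, ACT_AREA_CORRECTION)
 * and calculate_norm_frame_score() further divides by
 * twopass->mean_mod_score before clamping to the two-pass vbr min/max
 * section limits (expressed as fractions of 1.0). */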
+ modified_score *= + pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION); + + // Normalize to a midpoint score. + modified_score /= DOUBLE_DIVIDE_CHECK(twopass->mean_mod_score); + + return fclamp(modified_score, min_score, max_score); } // This function returns the maximum target rate per frame. @@ -292,6 +325,9 @@ void vp9_end_first_pass(VP9_COMP *cpi) { } else { output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list); } + + vpx_free(cpi->twopass.fp_mb_float_stats); + cpi->twopass.fp_mb_float_stats = NULL; } static vpx_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { @@ -466,8 +502,8 @@ static int scale_sse_threshold(VP9_COMMON *cm, int thresh) { if (cm->use_highbitdepth) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = thresh; break; - case VPX_BITS_10: ret_val = thresh >> 4; break; - case VPX_BITS_12: ret_val = thresh >> 8; break; + case VPX_BITS_10: ret_val = thresh << 4; break; + case VPX_BITS_12: ret_val = thresh << 8; break; default: assert(0 && "cm->bit_depth should be VPX_BITS_8, " @@ -652,39 +688,160 @@ static int fp_estimate_block_noise(MACROBLOCK *x, BLOCK_SIZE bsize) { return block_noise << 2; // Scale << 2 to account for sampling. } -#define INVALID_ROW -1 -void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { +// This function is called to test the functionality of row based +// multi-threading in unit tests for bit-exactness +static void accumulate_floating_point_stats(VP9_COMP *cpi, + TileDataEnc *first_tile_col) { + VP9_COMMON *const cm = &cpi->common; int mb_row, mb_col; - MACROBLOCK *const x = &cpi->td.mb; + first_tile_col->fp_data.intra_factor = 0; + first_tile_col->fp_data.brightness_factor = 0; + first_tile_col->fp_data.neutral_count = 0; + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + const int mb_index = mb_row * cm->mb_cols + mb_col; + first_tile_col->fp_data.intra_factor += + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor; + first_tile_col->fp_data.brightness_factor += + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor; + first_tile_col->fp_data.neutral_count += + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count; + } + } +} + +static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, + FIRSTPASS_DATA *fp_acc_data) { + VP9_COMMON *const cm = &cpi->common; + // The minimum error here insures some bit allocation to frames even + // in static regions. The allocation per MB declines for larger formats + // where the typical "real" energy per MB also falls. + // Initial estimate here uses sqrt(mbs) to define the min_err, where the + // number of mbs is proportional to the image area. + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs + : cpi->common.MBs; + const double min_err = 200 * sqrt(num_mbs); + + // Clamp the image start to rows/2. This number of rows is discarded top + // and bottom as dead data so rows / 2 means the frame is blank. 
+ if ((fp_acc_data->image_data_start_row > cm->mb_rows / 2) || + (fp_acc_data->image_data_start_row == INVALID_ROW)) { + fp_acc_data->image_data_start_row = cm->mb_rows / 2; + } + // Exclude any image dead zone + if (fp_acc_data->image_data_start_row > 0) { + fp_acc_data->intra_skip_count = + VPXMAX(0, + fp_acc_data->intra_skip_count - + (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); + } + + fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs; + fp_acc_data->brightness_factor = + fp_acc_data->brightness_factor / (double)num_mbs; + fps->weight = fp_acc_data->intra_factor * fp_acc_data->brightness_factor; + + fps->frame = cm->current_video_frame; + fps->spatial_layer_id = cpi->svc.spatial_layer_id; + + fps->coded_error = + ((double)(fp_acc_data->coded_error >> 8) + min_err) / num_mbs; + fps->sr_coded_error = + ((double)(fp_acc_data->sr_coded_error >> 8) + min_err) / num_mbs; + fps->intra_error = + ((double)(fp_acc_data->intra_error >> 8) + min_err) / num_mbs; + + fps->frame_noise_energy = + (double)(fp_acc_data->frame_noise_energy) / (double)num_mbs; + fps->count = 1.0; + fps->pcnt_inter = (double)(fp_acc_data->intercount) / num_mbs; + fps->pcnt_second_ref = (double)(fp_acc_data->second_ref_count) / num_mbs; + fps->pcnt_neutral = (double)(fp_acc_data->neutral_count) / num_mbs; + fps->pcnt_intra_low = (double)(fp_acc_data->intra_count_low) / num_mbs; + fps->pcnt_intra_high = (double)(fp_acc_data->intra_count_high) / num_mbs; + fps->intra_skip_pct = (double)(fp_acc_data->intra_skip_count) / num_mbs; + fps->intra_smooth_pct = (double)(fp_acc_data->intra_smooth_count) / num_mbs; + fps->inactive_zone_rows = (double)(fp_acc_data->image_data_start_row); + // Currently set to 0 as most issues relate to letter boxing. + fps->inactive_zone_cols = (double)0; + + if (fp_acc_data->mvcount > 0) { + fps->MVr = (double)(fp_acc_data->sum_mvr) / fp_acc_data->mvcount; + fps->mvr_abs = (double)(fp_acc_data->sum_mvr_abs) / fp_acc_data->mvcount; + fps->MVc = (double)(fp_acc_data->sum_mvc) / fp_acc_data->mvcount; + fps->mvc_abs = (double)(fp_acc_data->sum_mvc_abs) / fp_acc_data->mvcount; + fps->MVrv = ((double)(fp_acc_data->sum_mvrs) - + ((double)(fp_acc_data->sum_mvr) * (fp_acc_data->sum_mvr) / + fp_acc_data->mvcount)) / + fp_acc_data->mvcount; + fps->MVcv = ((double)(fp_acc_data->sum_mvcs) - + ((double)(fp_acc_data->sum_mvc) * (fp_acc_data->sum_mvc) / + fp_acc_data->mvcount)) / + fp_acc_data->mvcount; + fps->mv_in_out_count = + (double)(fp_acc_data->sum_in_vectors) / (fp_acc_data->mvcount * 2); + fps->pcnt_motion = (double)(fp_acc_data->mvcount) / num_mbs; + } else { + fps->MVr = 0.0; + fps->mvr_abs = 0.0; + fps->MVc = 0.0; + fps->mvc_abs = 0.0; + fps->MVrv = 0.0; + fps->MVcv = 0.0; + fps->mv_in_out_count = 0.0; + fps->pcnt_motion = 0.0; + } +} + +static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile, + FIRSTPASS_DATA *fp_acc_data) { + this_tile->fp_data.intra_factor += fp_acc_data->intra_factor; + this_tile->fp_data.brightness_factor += fp_acc_data->brightness_factor; + this_tile->fp_data.coded_error += fp_acc_data->coded_error; + this_tile->fp_data.sr_coded_error += fp_acc_data->sr_coded_error; + this_tile->fp_data.frame_noise_energy += fp_acc_data->frame_noise_energy; + this_tile->fp_data.intra_error += fp_acc_data->intra_error; + this_tile->fp_data.intercount += fp_acc_data->intercount; + this_tile->fp_data.second_ref_count += fp_acc_data->second_ref_count; + this_tile->fp_data.neutral_count += fp_acc_data->neutral_count; + this_tile->fp_data.intra_count_low += 
fp_acc_data->intra_count_low; + this_tile->fp_data.intra_count_high += fp_acc_data->intra_count_high; + this_tile->fp_data.intra_skip_count += fp_acc_data->intra_skip_count; + this_tile->fp_data.mvcount += fp_acc_data->mvcount; + this_tile->fp_data.sum_mvr += fp_acc_data->sum_mvr; + this_tile->fp_data.sum_mvr_abs += fp_acc_data->sum_mvr_abs; + this_tile->fp_data.sum_mvc += fp_acc_data->sum_mvc; + this_tile->fp_data.sum_mvc_abs += fp_acc_data->sum_mvc_abs; + this_tile->fp_data.sum_mvrs += fp_acc_data->sum_mvrs; + this_tile->fp_data.sum_mvcs += fp_acc_data->sum_mvcs; + this_tile->fp_data.sum_in_vectors += fp_acc_data->sum_in_vectors; + this_tile->fp_data.intra_smooth_count += fp_acc_data->intra_smooth_count; + this_tile->fp_data.image_data_start_row = + VPXMIN(this_tile->fp_data.image_data_start_row, + fp_acc_data->image_data_start_row) == INVALID_ROW + ? VPXMAX(this_tile->fp_data.image_data_start_row, + fp_acc_data->image_data_start_row) + : VPXMIN(this_tile->fp_data.image_data_start_row, + fp_acc_data->image_data_start_row); +} + +void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, + FIRSTPASS_DATA *fp_acc_data, + TileDataEnc *tile_data, MV *best_ref_mv, + int mb_row) { + int mb_col; + MACROBLOCK *const x = &td->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - TileInfo tile; + TileInfo tile = tile_data->tile_info; struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; - const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none; - int i; + const PICK_MODE_CONTEXT *ctx = &td->pc_root->none; + int i, c; + int num_mb_cols = get_num_cols(tile_data->tile_info, 1); int recon_yoffset, recon_uvoffset; - int64_t intra_error = 0; - int64_t coded_error = 0; - int64_t sr_coded_error = 0; - int64_t frame_noise_energy = 0; - - int sum_mvr = 0, sum_mvc = 0; - int sum_mvr_abs = 0, sum_mvc_abs = 0; - int64_t sum_mvrs = 0, sum_mvcs = 0; - int mvcount = 0; - int intercount = 0; - int second_ref_count = 0; const int intrapenalty = INTRA_MODE_PENALTY; - double neutral_count; - int intra_skip_count = 0; - int intra_smooth_count = 0; - int image_data_start_row = INVALID_ROW; - int new_mv_count = 0; - int sum_in_vectors = 0; - MV lastmv = { 0, 0 }; - TWO_PASS *twopass = &cpi->twopass; const MV zero_mv = { 0, 0 }; int recon_y_stride, recon_uv_stride, uv_mb_height; @@ -696,11 +853,537 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : NULL; - double intra_factor; - double brightness_factor; - BufferPool *const pool = cm->buffer_pool; MODE_INFO mi_above, mi_left; + double mb_intra_factor; + double mb_brightness_factor; + double mb_neutral_count; + + // First pass code requires valid last and new frame buffers. + assert(new_yv12 != NULL); + assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL)); + + if (lc != NULL) { + // Use either last frame or alt frame for motion search. 
+ if (cpi->ref_frame_flags & VP9_LAST_FLAG) { + first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME); + if (first_ref_buf == NULL) + first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME); + } + + if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { + gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME); + if (gld_yv12 == NULL) { + gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + } + } else { + gld_yv12 = NULL; + } + } + + xd->mi = cm->mi_grid_visible + xd->mi_stride * (mb_row << 1) + + (tile.mi_col_start >> 1); + xd->mi[0] = cm->mi + xd->mi_stride * (mb_row << 1) + (tile.mi_col_start >> 1); + + for (i = 0; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][1]; + p[i].qcoeff = ctx->qcoeff_pbuf[i][1]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; + p[i].eobs = ctx->eobs_pbuf[i][1]; + } + + recon_y_stride = new_yv12->y_stride; + recon_uv_stride = new_yv12->uv_stride; + uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height); + + // Reset above block coeffs. + recon_yoffset = + (mb_row * recon_y_stride * 16) + (tile.mi_col_start >> 1) * 16; + recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height) + + (tile.mi_col_start >> 1) * uv_mb_height; + + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16); + x->mv_limits.row_max = + ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16; + + for (mb_col = tile.mi_col_start >> 1, c = 0; mb_col < (tile.mi_col_end >> 1); + ++mb_col, c++) { + int this_error; + int this_intra_error; + const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); + const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col); + double log_intra; + int level_sample; + const int mb_index = mb_row * cm->mb_cols + mb_col; + +#if CONFIG_FP_MB_STATS + const int mb_index = mb_row * cm->mb_cols + mb_col; +#endif + + (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c); + + // Adjust to the next column of MBs. + x->plane[0].src.buf = cpi->Source->y_buffer + + mb_row * 16 * x->plane[0].src.stride + mb_col * 16; + x->plane[1].src.buf = cpi->Source->u_buffer + + mb_row * uv_mb_height * x->plane[1].src.stride + + mb_col * uv_mb_height; + x->plane[2].src.buf = cpi->Source->v_buffer + + mb_row * uv_mb_height * x->plane[1].src.stride + + mb_col * uv_mb_height; + + vpx_clear_system_state(); + + xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; + xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset; + xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset; + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize], + mb_col << 1, num_8x8_blocks_wide_lookup[bsize], cm->mi_rows, + cm->mi_cols); + // Are edges available for intra prediction? + // Since the firstpass does not populate the mi_grid_visible, + // above_mi/left_mi must be overwritten with a nonzero value when edges + // are available. Required by vp9_predict_intra_block(). + xd->above_mi = (mb_row != 0) ? &mi_above : NULL; + xd->left_mi = ((mb_col << 1) > tile.mi_col_start) ? &mi_left : NULL; + + // Do intra 16x16 prediction. + x->skip_encode = 0; + x->fp_src_pred = 0; + // Do intra prediction based on source pixels for tile boundaries + if ((mb_col == (tile.mi_col_start >> 1)) && mb_col != 0) { + xd->left_mi = &mi_left; + x->fp_src_pred = 1; + } + xd->mi[0]->mode = DC_PRED; + xd->mi[0]->tx_size = + use_dc_pred ? (bsize >= BLOCK_16X16 ? 
TX_16X16 : TX_8X8) : TX_4X4; + // Fix - zero the 16x16 block first. This ensures correct this_error for + // block sizes smaller than 16x16. + vp9_zero_array(x->plane[0].src_diff, 256); + vp9_encode_intra_block_plane(x, bsize, 0, 0); + this_error = vpx_get_mb_ss(x->plane[0].src_diff); + this_intra_error = this_error; + + // Keep a record of blocks that have very low intra error residual + // (i.e. are in effect completely flat and untextured in the intra + // domain). In natural videos this is uncommon, but it is much more + // common in animations, graphics and screen content, so may be used + // as a signal to detect these types of content. + if (this_error < get_ul_intra_threshold(cm)) { + ++(fp_acc_data->intra_skip_count); + } else if ((mb_col > 0) && + (fp_acc_data->image_data_start_row == INVALID_ROW)) { + fp_acc_data->image_data_start_row = mb_row; + } + + // Blocks that are mainly smooth in the intra domain. + // Some special accounting for CQ but also these are better for testing + // noise levels. + if (this_error < get_smooth_intra_threshold(cm)) { + ++(fp_acc_data->intra_smooth_count); + } + + // Special case noise measurement for first frame. + if (cm->current_video_frame == 0) { + if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { + fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); + } else { + fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; + } + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + switch (cm->bit_depth) { + case VPX_BITS_8: break; + case VPX_BITS_10: this_error >>= 4; break; + case VPX_BITS_12: this_error >>= 8; break; + default: + assert(0 && + "cm->bit_depth should be VPX_BITS_8, " + "VPX_BITS_10 or VPX_BITS_12"); + return; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + vpx_clear_system_state(); + log_intra = log(this_error + 1.0); + if (log_intra < 10.0) { + mb_intra_factor = 1.0 + ((10.0 - log_intra) * 0.05); + fp_acc_data->intra_factor += mb_intra_factor; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor = + mb_intra_factor; + } else { + fp_acc_data->intra_factor += 1.0; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor = 1.0; + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) + level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0]; + else + level_sample = x->plane[0].src.buf[0]; +#else + level_sample = x->plane[0].src.buf[0]; +#endif + if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) { + mb_brightness_factor = 1.0 + (0.01 * (DARK_THRESH - level_sample)); + fp_acc_data->brightness_factor += mb_brightness_factor; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor = + mb_brightness_factor; + } else { + fp_acc_data->brightness_factor += 1.0; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor = + 1.0; + } + + // Intrapenalty below deals with situations where the intra and inter + // error scores are very low (e.g. a plain black frame). + // We do not have special cases in first pass for 0,0 and nearest etc so + // all inter modes carry an overhead cost estimate for the mv. + // When the error score is very low this causes us to pick all or lots of + // INTRA modes and throw lots of key frames. + // This penalty adds a cost matching that of a 0,0 mv to the intra case. + this_error += intrapenalty; + + // Accumulate the intra error. 
+ fp_acc_data->intra_error += (int64_t)this_error; + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // initialization + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + } +#endif + + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); + x->mv_limits.col_max = + ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; + + // Other than for the first frame do a motion search. + if ((lc == NULL && cm->current_video_frame > 0) || + (lc != NULL && lc->current_video_frame_in_layer > 0)) { + int tmp_err, motion_error, raw_motion_error; + // Assume 0,0 motion with no mv overhead. + MV mv = { 0, 0 }, tmp_mv = { 0, 0 }; + struct buf_2d unscaled_last_source_buf_2d; + + xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); + } else { + motion_error = + get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); + } +#else + motion_error = + get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Compute the motion error of the 0,0 motion using the last source + // frame as the reference. Skip the further motion search on + // reconstructed frame if this error is small. + unscaled_last_source_buf_2d.buf = + cpi->unscaled_last_source->y_buffer + recon_yoffset; + unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + raw_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd); + } else { + raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &unscaled_last_source_buf_2d); + } +#else + raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &unscaled_last_source_buf_2d); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // TODO(pengchong): Replace the hard-coded threshold + if (raw_motion_error > 25 || lc != NULL) { + // Test last reference frame using the previous best mv as the + // starting point (best reference) for the search. + first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error); + + // If the current best reference mv is not centered on 0,0 then do a + // 0,0 based search as well. + if (!is_zero_mv(best_ref_mv)) { + tmp_err = INT_MAX; + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err); + + if (tmp_err < motion_error) { + motion_error = tmp_err; + mv = tmp_mv; + } + } + + // Search in an older reference frame. + if (((lc == NULL && cm->current_video_frame > 1) || + (lc != NULL && lc->current_video_frame_in_layer > 1)) && + gld_yv12 != NULL) { + // Assume 0,0 motion with no mv overhead. 
+ int gf_motion_error; + + xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + gf_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); + } else { + gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); + } +#else + gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &gf_motion_error); + + if (gf_motion_error < motion_error && gf_motion_error < this_error) + ++(fp_acc_data->second_ref_count); + + // Reset to last frame as reference buffer. + xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; + xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset; + xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset; + + // In accumulating a score for the older reference frame take the + // best of the motion predicted score and the intra coded error + // (just as will be done for) accumulation of "coded_error" for + // the last frame. + if (gf_motion_error < this_error) + fp_acc_data->sr_coded_error += gf_motion_error; + else + fp_acc_data->sr_coded_error += this_error; + } else { + fp_acc_data->sr_coded_error += motion_error; + } + } else { + fp_acc_data->sr_coded_error += motion_error; + } + + // Start by assuming that intra mode is best. + best_ref_mv->row = 0; + best_ref_mv->col = 0; + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // intra prediction statistics + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; + if (this_error > FPMB_ERROR_LARGE_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; + } else if (this_error < FPMB_ERROR_SMALL_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK; + } + } +#endif + + if (motion_error <= this_error) { + vpx_clear_system_state(); + + // Keep a count of cases where the inter and intra were very close + // and very low. This helps with scene cut detection for example in + // cropped clips with black bars at the sides or top and bottom. + if (((this_error - intrapenalty) * 9 <= motion_error * 10) && + (this_error < (2 * intrapenalty))) { + fp_acc_data->neutral_count += 1.0; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count = + 1.0; + // Also track cases where the intra is not much worse than the inter + // and use this in limiting the GF/arf group length. 
+ } else if ((this_error > NCOUNT_INTRA_THRESH) && + (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) { + mb_neutral_count = + (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error); + fp_acc_data->neutral_count += mb_neutral_count; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count = + mb_neutral_count; + } + + mv.row *= 8; + mv.col *= 8; + this_error = motion_error; + xd->mi[0]->mode = NEWMV; + xd->mi[0]->mv[0].as_mv = mv; + xd->mi[0]->tx_size = TX_4X4; + xd->mi[0]->ref_frame[0] = LAST_FRAME; + xd->mi[0]->ref_frame[1] = NONE; + vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize); + vp9_encode_sby_pass1(x, bsize); + fp_acc_data->sum_mvr += mv.row; + fp_acc_data->sum_mvr_abs += abs(mv.row); + fp_acc_data->sum_mvc += mv.col; + fp_acc_data->sum_mvc_abs += abs(mv.col); + fp_acc_data->sum_mvrs += mv.row * mv.row; + fp_acc_data->sum_mvcs += mv.col * mv.col; + ++(fp_acc_data->intercount); + + *best_ref_mv = mv; + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // inter prediction statistics + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; + if (this_error > FPMB_ERROR_LARGE_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; + } else if (this_error < FPMB_ERROR_SMALL_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK; + } + } +#endif + + if (!is_zero_mv(&mv)) { + ++(fp_acc_data->mvcount); + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_MOTION_ZERO_MASK; + // check estimated motion direction + if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) { + // right direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_RIGHT_MASK; + } else if (mv.as_mv.row < 0 && + abs(mv.as_mv.row) >= abs(mv.as_mv.col)) { + // up direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_UP_MASK; + } else if (mv.as_mv.col < 0 && + abs(mv.as_mv.col) >= abs(mv.as_mv.row)) { + // left direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_LEFT_MASK; + } else { + // down direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_DOWN_MASK; + } + } +#endif + + // Does the row vector point inwards or outwards? + if (mb_row < cm->mb_rows / 2) { + if (mv.row > 0) + --(fp_acc_data->sum_in_vectors); + else if (mv.row < 0) + ++(fp_acc_data->sum_in_vectors); + } else if (mb_row > cm->mb_rows / 2) { + if (mv.row > 0) + ++(fp_acc_data->sum_in_vectors); + else if (mv.row < 0) + --(fp_acc_data->sum_in_vectors); + } + + // Does the col vector point inwards or outwards? 
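The row test above and the matching column test that follows add or subtract one vote per non-zero mv component; the running total feeds mv_in_out_count = sum_in_vectors / (mvcount * 2) in first_pass_stat_calc earlier in this patch. A compact standalone restatement of that vote with the encoder types replaced by plain ints (this helper is illustrative and not part of the patch):

  /* Returns the contribution one mv component makes to sum_in_vectors.
   * 'pos' is the MB row (or column), 'half' is mb_rows / 2 (or mb_cols / 2),
   * 'mv_comp' is the matching mv component. The sign convention mirrors
   * the loop in the hunk above. */
  int in_out_vote(int pos, int half, int mv_comp) {
    if (pos < half) {
      if (mv_comp > 0) return -1;
      if (mv_comp < 0) return 1;
    } else if (pos > half) {
      if (mv_comp > 0) return 1;
      if (mv_comp < 0) return -1;
    }
    return 0; /* centre band, or zero component */
  }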
+ if (mb_col < cm->mb_cols / 2) { + if (mv.col > 0) + --(fp_acc_data->sum_in_vectors); + else if (mv.col < 0) + ++(fp_acc_data->sum_in_vectors); + } else if (mb_col > cm->mb_cols / 2) { + if (mv.col > 0) + ++(fp_acc_data->sum_in_vectors); + else if (mv.col < 0) + --(fp_acc_data->sum_in_vectors); + } + fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; + } else if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { + fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); + } else { // 0,0 mv but high error + fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; + } + } else { // Intra < inter error + int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH); + if (this_intra_error < scaled_low_intra_thresh) { + fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); + if (motion_error < scaled_low_intra_thresh) { + fp_acc_data->intra_count_low += 1.0; + } else { + fp_acc_data->intra_count_high += 1.0; + } + } else { + fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; + fp_acc_data->intra_count_high += 1.0; + } + } + } else { + fp_acc_data->sr_coded_error += (int64_t)this_error; + } + fp_acc_data->coded_error += (int64_t)this_error; + + recon_yoffset += 16; + recon_uvoffset += uv_mb_height; + + // Accumulate row level stats to the corresponding tile stats + if (cpi->row_mt && mb_col == (tile.mi_col_end >> 1) - 1) + accumulate_fp_mb_row_stat(tile_data, fp_acc_data); + + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, mb_row, c, + num_mb_cols); + } + vpx_clear_system_state(); +} + +static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) { + VP9_COMMON *const cm = &cpi->common; + int mb_row; + TileDataEnc tile_data; + TileInfo *tile = &tile_data.tile_info; + MV zero_mv = { 0, 0 }; + MV best_ref_mv; + // Tiling is ignored in the first pass. + vp9_tile_init(tile, cm, 0, 0); + + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + best_ref_mv = zero_mv; + vp9_first_pass_encode_tile_mb_row(cpi, &cpi->td, fp_acc_data, &tile_data, + &best_ref_mv, mb_row); + } +} + +void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { + MACROBLOCK *const x = &cpi->td.mb; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + TWO_PASS *twopass = &cpi->twopass; + + YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm); + const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; + + LAYER_CONTEXT *const lc = + is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id] + : NULL; + BufferPool *const pool = cm->buffer_pool; + + FIRSTPASS_DATA fp_temp_data; + FIRSTPASS_DATA *fp_acc_data = &fp_temp_data; + + vpx_clear_system_state(); + vp9_zero(fp_temp_data); + fp_acc_data->image_data_start_row = INVALID_ROW; + // First pass code requires valid last and new frame buffers. assert(new_yv12 != NULL); assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL)); @@ -711,12 +1394,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { } #endif - vpx_clear_system_state(); - - intra_factor = 0.0; - brightness_factor = 0.0; - neutral_count = 0.0; - set_first_pass_params(cpi); vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth)); @@ -761,7 +1438,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? 
GOLDEN_FRAME : NONE); cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, - &cpi->scaled_source, 0); + &cpi->scaled_source, 0, EIGHTTAP, 0); } vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); @@ -778,501 +1455,39 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { vp9_frame_init_quantizer(cpi); - for (i = 0; i < MAX_MB_PLANE; ++i) { - p[i].coeff = ctx->coeff_pbuf[i][1]; - p[i].qcoeff = ctx->qcoeff_pbuf[i][1]; - pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; - p[i].eobs = ctx->eobs_pbuf[i][1]; - } x->skip_recode = 0; vp9_init_mv_probs(cm); vp9_initialize_rd_consts(cpi); - // Tiling is ignored in the first pass. - vp9_tile_init(&tile, cm, 0, 0); + cm->log2_tile_rows = 0; - recon_y_stride = new_yv12->y_stride; - recon_uv_stride = new_yv12->uv_stride; - uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height); - - for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { - MV best_ref_mv = { 0, 0 }; - - // Reset above block coeffs. - recon_yoffset = (mb_row * recon_y_stride * 16); - recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height); - - // Set up limit values for motion vectors to prevent them extending - // outside the UMV borders. - x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16); - x->mv_limits.row_max = - ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16; - - for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { - int this_error; - int this_intra_error; - const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); - const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col); - double log_intra; - int level_sample; - -#if CONFIG_FP_MB_STATS - const int mb_index = mb_row * cm->mb_cols + mb_col; -#endif - - vpx_clear_system_state(); - - xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; - xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset; - xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset; - xd->mi[0]->sb_type = bsize; - xd->mi[0]->ref_frame[0] = INTRA_FRAME; - set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize], - mb_col << 1, num_8x8_blocks_wide_lookup[bsize], - cm->mi_rows, cm->mi_cols); - // Are edges available for intra prediction? - // Since the firstpass does not populate the mi_grid_visible, - // above_mi/left_mi must be overwritten with a nonzero value when edges - // are available. Required by vp9_predict_intra_block(). - xd->above_mi = (mb_row != 0) ? &mi_above : NULL; - xd->left_mi = (mb_col > tile.mi_col_start) ? &mi_left : NULL; - - // Do intra 16x16 prediction. - x->skip_encode = 0; - xd->mi[0]->mode = DC_PRED; - xd->mi[0]->tx_size = - use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4; - vp9_encode_intra_block_plane(x, bsize, 0, 0); - this_error = vpx_get_mb_ss(x->plane[0].src_diff); - this_intra_error = this_error; - - // Keep a record of blocks that have very low intra error residual - // (i.e. are in effect completely flat and untextured in the intra - // domain). In natural videos this is uncommon, but it is much more - // common in animations, graphics and screen content, so may be used - // as a signal to detect these types of content. - if (this_error < get_ul_intra_threshold(cm)) { - ++intra_skip_count; - } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) { - image_data_start_row = mb_row; - } - - // Blocks that are mainly smooth in the intra domain. - // Some special accounting for CQ but also these are better for testing - // noise levels. 
- if (this_error < get_smooth_intra_threshold(cm)) { - ++intra_smooth_count; - } - - // Special case noise measurement for first frame. - if (cm->current_video_frame == 0) { - if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { - frame_noise_energy += fp_estimate_block_noise(x, bsize); - } else { - frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } - } - -#if CONFIG_VP9_HIGHBITDEPTH - if (cm->use_highbitdepth) { - switch (cm->bit_depth) { - case VPX_BITS_8: break; - case VPX_BITS_10: this_error >>= 4; break; - case VPX_BITS_12: this_error >>= 8; break; - default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); - return; - } - } -#endif // CONFIG_VP9_HIGHBITDEPTH - - vpx_clear_system_state(); - log_intra = log(this_error + 1.0); - if (log_intra < 10.0) - intra_factor += 1.0 + ((10.0 - log_intra) * 0.05); - else - intra_factor += 1.0; - -#if CONFIG_VP9_HIGHBITDEPTH - if (cm->use_highbitdepth) - level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0]; - else - level_sample = x->plane[0].src.buf[0]; -#else - level_sample = x->plane[0].src.buf[0]; -#endif - if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) - brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample)); - else - brightness_factor += 1.0; - - // Intrapenalty below deals with situations where the intra and inter - // error scores are very low (e.g. a plain black frame). - // We do not have special cases in first pass for 0,0 and nearest etc so - // all inter modes carry an overhead cost estimate for the mv. - // When the error score is very low this causes us to pick all or lots of - // INTRA modes and throw lots of key frames. - // This penalty adds a cost matching that of a 0,0 mv to the intra case. - this_error += intrapenalty; - - // Accumulate the intra error. - intra_error += (int64_t)this_error; - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // initialization - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - } -#endif - - // Set up limit values for motion vectors to prevent them extending - // outside the UMV borders. - x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); - x->mv_limits.col_max = - ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; - - // Other than for the first frame do a motion search. - if ((lc == NULL && cm->current_video_frame > 0) || - (lc != NULL && lc->current_video_frame_in_layer > 0)) { - int tmp_err, motion_error, raw_motion_error; - // Assume 0,0 motion with no mv overhead. - MV mv = { 0, 0 }, tmp_mv = { 0, 0 }; - struct buf_2d unscaled_last_source_buf_2d; - - xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - motion_error = highbd_get_prediction_error( - bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); - } else { - motion_error = get_prediction_error(bsize, &x->plane[0].src, - &xd->plane[0].pre[0]); - } -#else - motion_error = - get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); -#endif // CONFIG_VP9_HIGHBITDEPTH - - // Compute the motion error of the 0,0 motion using the last source - // frame as the reference. Skip the further motion search on - // reconstructed frame if this error is small. 
- unscaled_last_source_buf_2d.buf = - cpi->unscaled_last_source->y_buffer + recon_yoffset; - unscaled_last_source_buf_2d.stride = - cpi->unscaled_last_source->y_stride; -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - raw_motion_error = highbd_get_prediction_error( - bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd); - } else { - raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &unscaled_last_source_buf_2d); - } -#else - raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &unscaled_last_source_buf_2d); -#endif // CONFIG_VP9_HIGHBITDEPTH - - // TODO(pengchong): Replace the hard-coded threshold - if (raw_motion_error > 25 || lc != NULL) { - // Test last reference frame using the previous best mv as the - // starting point (best reference) for the search. - first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error); - - // If the current best reference mv is not centered on 0,0 then do a - // 0,0 based search as well. - if (!is_zero_mv(&best_ref_mv)) { - tmp_err = INT_MAX; - first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err); - - if (tmp_err < motion_error) { - motion_error = tmp_err; - mv = tmp_mv; - } - } - - // Search in an older reference frame. - if (((lc == NULL && cm->current_video_frame > 1) || - (lc != NULL && lc->current_video_frame_in_layer > 1)) && - gld_yv12 != NULL) { - // Assume 0,0 motion with no mv overhead. - int gf_motion_error; - - xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - gf_motion_error = highbd_get_prediction_error( - bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); - } else { - gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &xd->plane[0].pre[0]); - } -#else - gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &xd->plane[0].pre[0]); -#endif // CONFIG_VP9_HIGHBITDEPTH - - first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, - &gf_motion_error); - - if (gf_motion_error < motion_error && gf_motion_error < this_error) - ++second_ref_count; - - // Reset to last frame as reference buffer. - xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; - xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset; - xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset; - - // In accumulating a score for the older reference frame take the - // best of the motion predicted score and the intra coded error - // (just as will be done for) accumulation of "coded_error" for - // the last frame. - if (gf_motion_error < this_error) - sr_coded_error += gf_motion_error; - else - sr_coded_error += this_error; - } else { - sr_coded_error += motion_error; - } - } else { - sr_coded_error += motion_error; - } - - // Start by assuming that intra mode is best. 
- best_ref_mv.row = 0; - best_ref_mv.col = 0; - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // intra prediction statistics - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; - if (this_error > FPMB_ERROR_LARGE_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; - } else if (this_error < FPMB_ERROR_SMALL_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK; - } - } -#endif - - if (motion_error <= this_error) { - vpx_clear_system_state(); - - // Keep a count of cases where the inter and intra were very close - // and very low. This helps with scene cut detection for example in - // cropped clips with black bars at the sides or top and bottom. - if (((this_error - intrapenalty) * 9 <= motion_error * 10) && - (this_error < (2 * intrapenalty))) { - neutral_count += 1.0; - // Also track cases where the intra is not much worse than the inter - // and use this in limiting the GF/arf group length. - } else if ((this_error > NCOUNT_INTRA_THRESH) && - (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) { - neutral_count += - (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error); - } - - mv.row *= 8; - mv.col *= 8; - this_error = motion_error; - xd->mi[0]->mode = NEWMV; - xd->mi[0]->mv[0].as_mv = mv; - xd->mi[0]->tx_size = TX_4X4; - xd->mi[0]->ref_frame[0] = LAST_FRAME; - xd->mi[0]->ref_frame[1] = NONE; - vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize); - vp9_encode_sby_pass1(x, bsize); - sum_mvr += mv.row; - sum_mvr_abs += abs(mv.row); - sum_mvc += mv.col; - sum_mvc_abs += abs(mv.col); - sum_mvrs += mv.row * mv.row; - sum_mvcs += mv.col * mv.col; - ++intercount; - - best_ref_mv = mv; - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // inter prediction statistics - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; - if (this_error > FPMB_ERROR_LARGE_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_ERROR_LARGE_MASK; - } else if (this_error < FPMB_ERROR_SMALL_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_ERROR_SMALL_MASK; - } - } -#endif - - if (!is_zero_mv(&mv)) { - ++mvcount; - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - cpi->twopass.frame_mb_stats_buf[mb_index] &= - ~FPMB_MOTION_ZERO_MASK; - // check estimated motion direction - if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) { - // right direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_RIGHT_MASK; - } else if (mv.as_mv.row < 0 && - abs(mv.as_mv.row) >= abs(mv.as_mv.col)) { - // up direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_UP_MASK; - } else if (mv.as_mv.col < 0 && - abs(mv.as_mv.col) >= abs(mv.as_mv.row)) { - // left direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_LEFT_MASK; - } else { - // down direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_DOWN_MASK; - } - } -#endif - - // Non-zero vector, was it different from the last non zero vector? - if (!is_equal_mv(&mv, &lastmv)) ++new_mv_count; - lastmv = mv; - - // Does the row vector point inwards or outwards? 
- if (mb_row < cm->mb_rows / 2) { - if (mv.row > 0) - --sum_in_vectors; - else if (mv.row < 0) - ++sum_in_vectors; - } else if (mb_row > cm->mb_rows / 2) { - if (mv.row > 0) - ++sum_in_vectors; - else if (mv.row < 0) - --sum_in_vectors; - } - - // Does the col vector point inwards or outwards? - if (mb_col < cm->mb_cols / 2) { - if (mv.col > 0) - --sum_in_vectors; - else if (mv.col < 0) - ++sum_in_vectors; - } else if (mb_col > cm->mb_cols / 2) { - if (mv.col > 0) - ++sum_in_vectors; - else if (mv.col < 0) - --sum_in_vectors; - } - frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } else if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { - frame_noise_energy += fp_estimate_block_noise(x, bsize); - } else { // 0,0 mv but high error - frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } - } else { // Intra < inter error - if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) - frame_noise_energy += fp_estimate_block_noise(x, bsize); - else - frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } - } else { - sr_coded_error += (int64_t)this_error; - } - coded_error += (int64_t)this_error; - - // Adjust to the next column of MBs. - x->plane[0].src.buf += 16; - x->plane[1].src.buf += uv_mb_height; - x->plane[2].src.buf += uv_mb_height; - - recon_yoffset += 16; - recon_uvoffset += uv_mb_height; - } - - // Adjust to the next row of MBs. - x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols; - x->plane[1].src.buf += - uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols; - x->plane[2].src.buf += - uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols; - - vpx_clear_system_state(); - } - - // Clamp the image start to rows/2. This number of rows is discarded top - // and bottom as dead data so rows / 2 means the frame is blank. - if ((image_data_start_row > cm->mb_rows / 2) || - (image_data_start_row == INVALID_ROW)) { - image_data_start_row = cm->mb_rows / 2; - } - // Exclude any image dead zone - if (image_data_start_row > 0) { - intra_skip_count = - VPXMAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2)); - } + if (cpi->row_mt_bit_exact && cpi->twopass.fp_mb_float_stats == NULL) + CHECK_MEM_ERROR( + cm, cpi->twopass.fp_mb_float_stats, + vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1)); { FIRSTPASS_STATS fps; - // The minimum error here insures some bit allocation to frames even - // in static regions. The allocation per MB declines for larger formats - // where the typical "real" energy per MB also falls. - // Initial estimate here uses sqrt(mbs) to define the min_err, where the - // number of mbs is proportional to the image area. - const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) - ? 
cpi->initial_mbs - : cpi->common.MBs; - const double min_err = 200 * sqrt(num_mbs); - - intra_factor = intra_factor / (double)num_mbs; - brightness_factor = brightness_factor / (double)num_mbs; - fps.weight = intra_factor * brightness_factor; - - fps.frame = cm->current_video_frame; - fps.spatial_layer_id = cpi->svc.spatial_layer_id; - fps.coded_error = (double)(coded_error >> 8) + min_err; - fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err; - fps.intra_error = (double)(intra_error >> 8) + min_err; - fps.frame_noise_energy = (double)frame_noise_energy / (double)num_mbs; - fps.count = 1.0; - fps.pcnt_inter = (double)intercount / num_mbs; - fps.pcnt_second_ref = (double)second_ref_count / num_mbs; - fps.pcnt_neutral = (double)neutral_count / num_mbs; - fps.intra_skip_pct = (double)intra_skip_count / num_mbs; - fps.intra_smooth_pct = (double)intra_smooth_count / num_mbs; - fps.inactive_zone_rows = (double)image_data_start_row; - // Currently set to 0 as most issues relate to letter boxing. - fps.inactive_zone_cols = (double)0; - - if (mvcount > 0) { - fps.MVr = (double)sum_mvr / mvcount; - fps.mvr_abs = (double)sum_mvr_abs / mvcount; - fps.MVc = (double)sum_mvc / mvcount; - fps.mvc_abs = (double)sum_mvc_abs / mvcount; - fps.MVrv = - ((double)sum_mvrs - ((double)sum_mvr * sum_mvr / mvcount)) / mvcount; - fps.MVcv = - ((double)sum_mvcs - ((double)sum_mvc * sum_mvc / mvcount)) / mvcount; - fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2); - fps.new_mv_count = new_mv_count; - fps.pcnt_motion = (double)mvcount / num_mbs; + TileDataEnc *first_tile_col; + if (!cpi->row_mt) { + cm->log2_tile_cols = 0; + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy; + first_pass_encode(cpi, fp_acc_data); + first_pass_stat_calc(cpi, &fps, fp_acc_data); } else { - fps.MVr = 0.0; - fps.mvr_abs = 0.0; - fps.MVc = 0.0; - fps.mvc_abs = 0.0; - fps.MVrv = 0.0; - fps.MVcv = 0.0; - fps.mv_in_out_count = 0.0; - fps.new_mv_count = 0.0; - fps.pcnt_motion = 0.0; + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write; + if (cpi->row_mt_bit_exact) { + cm->log2_tile_cols = 0; + vp9_zero_array(cpi->twopass.fp_mb_float_stats, cm->MBs); + } + vp9_encode_fp_row_mt(cpi); + first_tile_col = &cpi->tile_data[0]; + if (cpi->row_mt_bit_exact) + accumulate_floating_point_stats(cpi, first_tile_col); + first_pass_stat_calc(cpi, &fps, &(first_tile_col->fp_data)); } // Dont allow a value of 0 for duration. @@ -1345,14 +1560,22 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { if (cpi->use_svc) vp9_inc_frame_in_layer(cpi); } -static double calc_correction_factor(double err_per_mb, double err_divisor, - double pt_low, double pt_high, int q, - vpx_bit_depth_t bit_depth) { - const double error_term = err_per_mb / err_divisor; +static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { + 0.65, 0.70, 0.75, 0.85, 0.90, 0.90, 0.90, 1.00, 1.25 +}; - // Adjustment based on actual quantizer to power term. - const double power_term = - VPXMIN(vp9_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high); +static double calc_correction_factor(double err_per_mb, double err_divisor, + int q) { + const double error_term = err_per_mb / DOUBLE_DIVIDE_CHECK(err_divisor); + const int index = q >> 5; + double power_term; + + assert((index >= 0) && (index < (QINDEX_RANGE >> 5))); + + // Adjustment based on quantizer to the power term. 
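For reference, the rewritten correction factor replaces the old qindex-to-q based power term with a nine-entry table sampled every 32 qindex steps and linearly interpolated between samples. A standalone sketch of that evaluation (the table values are copied from the hunk above; the wrapper function and the example q values in main are illustrative):

  #include <assert.h>
  #include <stdio.h>

  #define QINDEX_RANGE 256

  /* Same sample points as q_pow_term in the patch: one entry per 32
   * qindex steps plus a final endpoint. */
  static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = {
    0.65, 0.70, 0.75, 0.85, 0.90, 0.90, 0.90, 1.00, 1.25
  };

  /* Piecewise-linear evaluation of the power term for a given qindex. */
  static double power_term_for_q(int q) {
    const int index = q >> 5;
    assert(index >= 0 && index < (QINDEX_RANGE >> 5));
    return q_pow_term[index] +
           ((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0;
  }

  int main(void) {
    /* q = 48 sits halfway between the 0.70 and 0.75 samples. */
    printf("power_term(48)  = %.3f\n", power_term_for_q(48));
    printf("power_term(255) = %.3f\n", power_term_for_q(255));
    return 0;
  }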
+ power_term = + q_pow_term[index] + + (((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0); // Calculate correction factor. if (power_term < 1.0) assert(error_term >= 0.0); @@ -1369,6 +1592,7 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, const RATE_CONTROL *const rc = &cpi->rc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; TWO_PASS *const twopass = &cpi->twopass; + double last_group_rate_err; // Clamp the target rate to VBR min / max limts. const int target_rate = @@ -1377,24 +1601,35 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, noise_factor = fclamp(noise_factor, NOISE_FACTOR_MIN, NOISE_FACTOR_MAX); inactive_zone = fclamp(inactive_zone, 0.0, 1.0); +// TODO(jimbankoski): remove #if here or below when this has been +// well tested. +#if CONFIG_ALWAYS_ADJUST_BPM + // based on recent history adjust expectations of bits per macroblock. + last_group_rate_err = + (double)twopass->rolling_arf_group_actual_bits / + DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits); + last_group_rate_err = VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err)); + twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0; + twopass->bpm_factor = VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor)); +#endif + if (target_rate <= 0) { return rc->worst_quality; // Highest value allowed } else { const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs : cpi->common.MBs; - const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone)); - const double av_err_per_mb = section_err / active_mbs; + const double active_pct = VPXMAX(0.01, 1.0 - inactive_zone); + const int active_mbs = (int)VPXMAX(1, (double)num_mbs * active_pct); + const double av_err_per_mb = section_err / active_pct; const double speed_term = 1.0 + 0.04 * oxcf->speed; - double last_group_rate_err; const int target_norm_bits_per_mb = (int)(((uint64_t)target_rate << BPER_MB_NORMBITS) / active_mbs); int q; - int is_svc_upper_layer = 0; - - if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) - is_svc_upper_layer = 1; +// TODO(jimbankoski): remove #if here or above when this has been +// well tested. +#if !CONFIG_ALWAYS_ADJUST_BPM // based on recent history adjust expectations of bits per macroblock. last_group_rate_err = (double)twopass->rolling_arf_group_actual_bits / @@ -1402,14 +1637,13 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, last_group_rate_err = VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err)); twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0; twopass->bpm_factor = VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor)); +#endif // Try and pick a max Q that will be high enough to encode the // content at the given rate. for (q = rc->best_quality; q < rc->worst_quality; ++q) { - const double factor = calc_correction_factor( - av_err_per_mb, ERR_DIVISOR, - is_svc_upper_layer ? 
SVC_FACTOR_PT_LOW : FACTOR_PT_LOW, - FACTOR_PT_HIGH, q, cpi->common.bit_depth); + const double factor = + calc_correction_factor(av_err_per_mb, ERR_DIVISOR, q); const int bits_per_mb = vp9_rc_bits_per_mb( INTER_FRAME, q, factor * speed_term * cpi->twopass.bpm_factor * noise_factor, @@ -1457,7 +1691,7 @@ void calculate_coded_size(VP9_COMP *cpi, int *scaled_frame_width, void vp9_init_second_pass(VP9_COMP *cpi) { SVC *const svc = &cpi->svc; - const VP9EncoderConfig *const oxcf = &cpi->oxcf; + VP9EncoderConfig *const oxcf = &cpi->oxcf; const int is_two_pass_svc = (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1); RATE_CONTROL *const rc = &cpi->rc; @@ -1477,6 +1711,63 @@ void vp9_init_second_pass(VP9_COMP *cpi) { *stats = *twopass->stats_in_end; twopass->total_left_stats = *stats; + // Scan the first pass file and calculate a modified score for each + // frame that is used to distribute bits. The modified score is assumed + // to provide a linear basis for bit allocation. I.e a frame A with a score + // that is double that of frame B will be allocated 2x as many bits. + { + double modified_score_total = 0.0; + const FIRSTPASS_STATS *s = twopass->stats_in; + double av_err; + + if (oxcf->vbr_corpus_complexity) { + twopass->mean_mod_score = (double)oxcf->vbr_corpus_complexity / 10.0; + av_err = get_distribution_av_err(cpi, twopass); + } else { + av_err = get_distribution_av_err(cpi, twopass); + // The first scan is unclamped and gives a raw average. + while (s < twopass->stats_in_end) { + modified_score_total += calculate_mod_frame_score(cpi, oxcf, s, av_err); + ++s; + } + + // The average error from this first scan is used to define the midpoint + // error for the rate distribution function. + twopass->mean_mod_score = + modified_score_total / DOUBLE_DIVIDE_CHECK(stats->count); + } + + // Second scan using clamps based on the previous cycle average. + // This may modify the total and average somewhat but we dont bother with + // further itterations. + modified_score_total = 0.0; + s = twopass->stats_in; + while (s < twopass->stats_in_end) { + modified_score_total += + calculate_norm_frame_score(cpi, twopass, oxcf, s, av_err); + ++s; + } + twopass->normalized_score_left = modified_score_total; + + // If using Corpus wide VBR mode then update the clip target bandwidth to + // reflect how the clip compares to the rest of the corpus. + if (oxcf->vbr_corpus_complexity) { + oxcf->target_bandwidth = + (int64_t)((double)oxcf->target_bandwidth * + (twopass->normalized_score_left / stats->count)); + } + +#if COMPLEXITY_STATS_OUTPUT + { + FILE *compstats; + compstats = fopen("complexity_stats.stt", "a"); + fprintf(compstats, "%10.3lf\n", + twopass->normalized_score_left / stats->count); + fclose(compstats); + } +#endif + } + frame_rate = 10000000.0 * stats->count / stats->duration; // Each frame can have a different duration, as the frame rate in the source // isn't guaranteed to be constant. The frame rate prior to the first frame @@ -1499,24 +1790,6 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // This variable monitors how far behind the second ref update is lagging. twopass->sr_update_lag = 1; - // Scan the first pass file and calculate a modified total error based upon - // the bias/power function used to allocate bits. 
- { - const double avg_error = - stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count); - const FIRSTPASS_STATS *s = twopass->stats_in; - double modified_error_total = 0.0; - twopass->modified_error_min = - (avg_error * oxcf->two_pass_vbrmin_section) / 100; - twopass->modified_error_max = - (avg_error * oxcf->two_pass_vbrmax_section) / 100; - while (s < twopass->stats_in_end) { - modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s); - ++s; - } - twopass->modified_error_left = modified_error_total; - } - // Reset the vbr bits off target counters rc->vbr_bits_off_target = 0; rc->vbr_bits_off_target_fast = 0; @@ -1551,9 +1824,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) { static double get_sr_decay_rate(const VP9_COMP *cpi, const FIRSTPASS_STATS *frame) { - const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs - : cpi->common.MBs; - double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs; + double sr_diff = (frame->sr_coded_error - frame->coded_error); double sr_decay = 1.0; double modified_pct_inter; double modified_pcnt_intra; @@ -1562,10 +1833,11 @@ static double get_sr_decay_rate(const VP9_COMP *cpi, (cpi->initial_height + cpi->initial_width)); modified_pct_inter = frame->pcnt_inter; - if (((frame->coded_error / num_mbs) > LOW_CODED_ERR_PER_MB) && + if ((frame->coded_error > LOW_CODED_ERR_PER_MB) && ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < (double)NCOUNT_FRAME_II_THRESH)) { - modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral; + modified_pct_inter = + frame->pcnt_inter + frame->pcnt_intra_low - frame->pcnt_neutral; } modified_pcnt_intra = 100 * (1.0 - modified_pct_inter); @@ -1574,7 +1846,7 @@ static double get_sr_decay_rate(const VP9_COMP *cpi, sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - motion_amplitude_part - (INTRA_PART * modified_pcnt_intra); } - return VPXMAX(sr_decay, VPXMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter)); + return VPXMAX(sr_decay, DEFAULT_DECAY_LIMIT); } // This function gives an estimate of how badly we believe the prediction @@ -1675,38 +1947,63 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, } } -#define BASELINE_ERR_PER_MB 1000.0 +#define BASELINE_ERR_PER_MB 12500.0 +#define GF_MAX_BOOST 96.0 static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, - double this_frame_mv_in_out, double max_boost) { + double this_frame_mv_in_out) { double frame_boost; const double lq = vp9_convert_qindex_to_q( cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5); - int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs - : cpi->common.MBs; - - // Correct for any inactive region in the image - num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame)); + const double active_area = calculate_active_area(cpi, this_frame); // Underlying boost factor is based on inter error ratio. - frame_boost = (BASELINE_ERR_PER_MB * num_mbs) / + frame_boost = (BASELINE_ERR_PER_MB * active_area) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error); - frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction; - // Increase boost for frames where new data coming into frame (e.g. zoom out). - // Slightly reduce boost if there is a net balance of motion out of the frame - // (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0. 
+ // Small adjustment for cases where there is a zoom out if (this_frame_mv_in_out > 0.0) frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); - // In the extreme case the boost is halved. - else - frame_boost += frame_boost * (this_frame_mv_in_out / 2.0); + + // Q correction and scalling + frame_boost = frame_boost * boost_q_correction; + + return VPXMIN(frame_boost, GF_MAX_BOOST * boost_q_correction); +} + +#define KF_BASELINE_ERR_PER_MB 12500.0 +static double calc_kf_frame_boost(VP9_COMP *cpi, + const FIRSTPASS_STATS *this_frame, + double *sr_accumulator, + double this_frame_mv_in_out, + double max_boost) { + double frame_boost; + const double lq = vp9_convert_qindex_to_q( + cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); + const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00); + const double active_area = calculate_active_area(cpi, this_frame); + + // Underlying boost factor is based on inter error ratio. + frame_boost = (KF_BASELINE_ERR_PER_MB * active_area) / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); + + // Update the accumulator for second ref error difference. + // This is intended to give an indication of how much the coded error is + // increasing over time. + *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error); + *sr_accumulator = VPXMAX(0.0, *sr_accumulator); + + // Small adjustment for cases where there is a zoom out + if (this_frame_mv_in_out > 0.0) + frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); + + // Q correction and scalling + frame_boost = frame_boost * boost_q_correction; return VPXMIN(frame_boost, max_boost * boost_q_correction); } -static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, - int *f_boost, int *b_boost) { +static int calc_arf_boost(VP9_COMP *cpi, int f_frames, int b_frames) { TWO_PASS *const twopass = &cpi->twopass; int i; double boost_score = 0.0; @@ -1720,7 +2017,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, // Search forward from the proposed arf/next gf position. for (i = 0; i < f_frames; ++i) { - const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); + const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i); if (this_frame == NULL) break; // Update the motion related elements to the boost calculation. @@ -1730,8 +2027,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, // We want to discount the flash frame itself and the recovery // frame that follows as both will have poor scores. - flash_detected = detect_flash(twopass, i + offset) || - detect_flash(twopass, i + offset + 1); + flash_detected = detect_flash(twopass, i) || detect_flash(twopass, i + 1); // Accumulate the effect of prediction quality decay. if (!flash_detected) { @@ -1740,13 +2036,11 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, ? MIN_DECAY_FACTOR : decay_accumulator; } - - boost_score += - decay_accumulator * - calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST); + boost_score += decay_accumulator * + calc_frame_boost(cpi, this_frame, this_frame_mv_in_out); } - *f_boost = (int)boost_score; + arf_boost = (int)boost_score; // Reset for backward looking loop. boost_score = 0.0; @@ -1758,7 +2052,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, // Search backward towards last gf position. 
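Both boost helpers above are driven by the ratio of a fixed baseline error per MB to the frame's coded error stat, scaled by the active image area and a quantizer correction, then capped. A standalone numeric sketch of the non-keyframe path with the stats lookups replaced by plain arguments (the constants come from the hunks above; the helper names and the inputs in main are made up for illustration):

  #include <stdio.h>

  #define BASELINE_ERR_PER_MB 12500.0
  #define GF_MAX_BOOST 96.0

  /* Simplified stand-in for DOUBLE_DIVIDE_CHECK, which guards divisions
   * in the encoder. */
  static double divide_check(double x) { return x < 0.0000001 ? 0.0000001 : x; }

  static double min_d(double a, double b) { return a < b ? a : b; }

  /* Sketch of the calc_frame_boost() flow: active_area in [0,1], the
   * frame's coded_error stat, mv_in_out in [-1,1], and lq = the average
   * inter-frame quantizer converted to q. */
  static double frame_boost_sketch(double active_area, double coded_error,
                                   double mv_in_out, double lq) {
    const double boost_q_correction = min_d(0.5 + lq * 0.015, 1.5);
    double boost =
        (BASELINE_ERR_PER_MB * active_area) / divide_check(coded_error);
    if (mv_in_out > 0.0) boost += boost * (mv_in_out * 2.0); /* zoom out */
    boost *= boost_q_correction;
    return min_d(boost, GF_MAX_BOOST * boost_q_correction);
  }

  int main(void) {
    /* Example: fully active frame, moderate error, slight zoom out, lq = 40 */
    printf("boost = %.2f\n", frame_boost_sketch(1.0, 400.0, 0.1, 40.0));
    return 0;
  }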
for (i = -1; i >= -b_frames; --i) { - const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); + const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i); if (this_frame == NULL) break; // Update the motion related elements to the boost calculation. @@ -1768,8 +2062,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, // We want to discount the the flash frame itself and the recovery // frame that follows as both will have poor scores. - flash_detected = detect_flash(twopass, i + offset) || - detect_flash(twopass, i + offset + 1); + flash_detected = detect_flash(twopass, i) || detect_flash(twopass, i + 1); // Cumulative effect of prediction quality decay. if (!flash_detected) { @@ -1778,16 +2071,13 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, ? MIN_DECAY_FACTOR : decay_accumulator; } - - boost_score += - decay_accumulator * - calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST); + boost_score += decay_accumulator * + calc_frame_boost(cpi, this_frame, this_frame_mv_in_out); } - *b_boost = (int)boost_score; + arf_boost += (int)boost_score; - arf_boost = (*f_boost + *b_boost); - if (arf_boost < ((b_frames + f_frames) * 20)) - arf_boost = ((b_frames + f_frames) * 20); + if (arf_boost < ((b_frames + f_frames) * 40)) + arf_boost = ((b_frames + f_frames) * 40); arf_boost = VPXMAX(arf_boost, MIN_ARF_GF_BOOST); return arf_boost; @@ -1821,7 +2111,7 @@ static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi, int64_t total_group_bits; // Calculate the bits to be allocated to the group as a whole. - if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) { + if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0.0)) { total_group_bits = (int64_t)(twopass->kf_group_bits * (gf_group_err / twopass->kf_group_error_left)); } else { @@ -1829,10 +2119,11 @@ static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi, } // Clamp odd edge cases. - total_group_bits = - (total_group_bits < 0) ? 0 : (total_group_bits > twopass->kf_group_bits) - ? twopass->kf_group_bits - : total_group_bits; + total_group_bits = (total_group_bits < 0) + ? 0 + : (total_group_bits > twopass->kf_group_bits) + ? twopass->kf_group_bits + : total_group_bits; // Clip based on user supplied data rate variability limit. if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) @@ -1847,7 +2138,7 @@ static int calculate_boost_bits(int frame_count, int boost, int allocation_chunks; // return 0 for invalid inputs (could arise e.g. through rounding errors) - if (!boost || (total_group_bits <= 0) || (frame_count <= 0)) return 0; + if (!boost || (total_group_bits <= 0) || (frame_count < 0)) return 0; allocation_chunks = (frame_count * 100) + boost; @@ -1875,8 +2166,33 @@ static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) { arf_buffer_indices[1] = ARF_SLOT2; } +// Used in corpus vbr: Calculates the total normalized group complexity score +// for a given number of frames starting at the current position in the stats +// file. +static double calculate_group_score(VP9_COMP *cpi, double av_score, + int frame_count) { + VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + const FIRSTPASS_STATS *s = twopass->stats_in; + double score_total = 0.0; + int i = 0; + + // We dont ever want to return a 0 score here. 
+ if (frame_count == 0) return 1.0; + + while ((i < frame_count) && (s < twopass->stats_in_end)) { + score_total += calculate_norm_frame_score(cpi, twopass, oxcf, s, av_score); + ++s; + ++i; + } + assert(i == frame_count); + + return score_total; +} + static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, int gf_arf_bits) { + VP9EncoderConfig *const oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; TWO_PASS *const twopass = &cpi->twopass; GF_GROUP *const gf_group = &twopass->gf_group; @@ -1885,7 +2201,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, int frame_index = 1; int target_frame_size; int key_frame; - const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf); + const int max_bits = frame_max_bits(&cpi->rc, oxcf); int64_t total_group_bits = gf_group_bits; int mid_boost_bits = 0; int mid_frame_idx; @@ -1895,8 +2211,10 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1; int normal_frames; int normal_frame_bits; - int last_frame_bits; - int last_frame_reduction; + int last_frame_reduction = 0; + double av_score = 1.0; + double tot_norm_frame_score = 1.0; + double this_frame_score = 1.0; // Only encode alt reference frame in temporal base layer. if (has_temporal_layers) alt_frame_index = cpi->svc.number_temporal_layers; @@ -1968,17 +2286,14 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending); - - // The last frame in the group is used less as a predictor so reduce - // its allocation a little. - if (normal_frames > 1) { + if (normal_frames > 1) normal_frame_bits = (int)(total_group_bits / normal_frames); - last_frame_reduction = normal_frame_bits / 16; - last_frame_bits = normal_frame_bits - last_frame_reduction; - } else { + else normal_frame_bits = (int)total_group_bits; - last_frame_bits = normal_frame_bits; - last_frame_reduction = 0; + + if (oxcf->vbr_corpus_complexity) { + av_score = get_distribution_av_err(cpi, twopass); + tot_norm_frame_score = calculate_group_score(cpi, av_score, normal_frames); } // Allocate bits to the other frames in the group. @@ -1990,11 +2305,18 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, ++frame_index; } - target_frame_size = (i == (normal_frames - 1)) - ? last_frame_bits - : (i == mid_frame_idx) - ? normal_frame_bits + last_frame_reduction - : normal_frame_bits; + if (oxcf->vbr_corpus_complexity) { + this_frame_score = calculate_norm_frame_score(cpi, twopass, oxcf, + &frame_stats, av_score); + normal_frame_bits = (int)((double)total_group_bits * + (this_frame_score / tot_norm_frame_score)); + } + + target_frame_size = normal_frame_bits; + if ((i == (normal_frames - 1)) && (i >= 1)) { + last_frame_reduction = normal_frame_bits / 16; + target_frame_size -= last_frame_reduction; + } if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) { mid_boost_bits += (target_frame_size >> 4); @@ -2015,6 +2337,9 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, ++frame_index; } + // Add in some extra bits for the middle frame in the group. + gf_group->bit_allocation[mid_frame_idx] += last_frame_reduction; + // Note: // We need to configure the frame at the end of the sequence + 1 that will be // the start frame for the next group. 
Otherwise prior to the call to @@ -2057,6 +2382,9 @@ static void adjust_group_arnr_filter(VP9_COMP *cpi, double section_noise, } // Analyse and define a gf/arf group. +#define ARF_DECAY_BREAKOUT 0.10 +#define ARF_ABS_ZOOM_THRESH 4.0 + static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -2066,8 +2394,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { const FIRSTPASS_STATS *const start_pos = twopass->stats_in; int i; - double boost_score = 0.0; - double old_boost_score = 0.0; double gf_group_err = 0.0; double gf_group_raw_error = 0.0; double gf_group_noise = 0.0; @@ -2079,9 +2405,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double mod_frame_err = 0.0; double mv_ratio_accumulator = 0.0; - double decay_accumulator = 1.0; double zero_motion_accumulator = 1.0; - double loop_decay_rate = 1.00; double last_loop_decay_rate = 1.00; @@ -2089,10 +2413,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; double mv_ratio_accumulator_thresh; + double abs_mv_in_out_thresh; + double sr_accumulator = 0.0; + const double av_err = get_distribution_av_err(cpi, twopass); unsigned int allow_alt_ref = is_altref_enabled(cpi); - int f_boost = 0; - int b_boost = 0; int flash_detected; int active_max_gf_interval; int active_min_gf_interval; @@ -2111,7 +2436,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { vp9_zero(next_frame); // Load stats for the current frame. - mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); + mod_frame_err = + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); // Note the error of the frame at the start of the group. This will be // the GF frame error if we code a normal gf. @@ -2132,6 +2458,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Motion breakout threshold for loop below depends on image size. mv_ratio_accumulator_thresh = (cpi->initial_height + cpi->initial_width) / 4.0; + abs_mv_in_out_thresh = ARF_ABS_ZOOM_THRESH; // Set a maximum and minimum interval for the GF group. // If the image appears almost completely static we can extend beyond this. @@ -2142,8 +2469,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { cpi->common.bit_depth)); active_min_gf_interval = rc->min_gf_interval + arf_active_or_kf + VPXMIN(2, int_max_q / 200); - if (active_min_gf_interval > rc->max_gf_interval) - active_min_gf_interval = rc->max_gf_interval; + active_min_gf_interval = + VPXMIN(active_min_gf_interval, rc->max_gf_interval + arf_active_or_kf); if (cpi->multi_arf_allowed) { active_max_gf_interval = rc->max_gf_interval; @@ -2154,11 +2481,14 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // interval to spread the cost of the GF. active_max_gf_interval = 12 + arf_active_or_kf + VPXMIN(4, (int_lbq / 6)); - // We have: active_min_gf_interval <= rc->max_gf_interval - if (active_max_gf_interval < active_min_gf_interval) + // We have: active_min_gf_interval <= + // rc->max_gf_interval + arf_active_or_kf. 
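Taken together, the clamps above and the branch below keep both active bounds at or above rc->min_gf_interval and at or below rc->max_gf_interval + arf_active_or_kf. A consolidated sketch of the single-ARF path (when multi_arf_allowed the maximum is simply rc->max_gf_interval); the function and parameter names are illustrative, not the libvpx API.

static void active_gf_interval_bounds(int min_gf_interval, int max_gf_interval,
                                      int arf_active_or_kf, int int_max_q,
                                      int int_lbq, int *active_min,
                                      int *active_max) {
  const int hard_cap = max_gf_interval + arf_active_or_kf;
  int lo = min_gf_interval + arf_active_or_kf +
           (int_max_q / 200 < 2 ? int_max_q / 200 : 2);
  int hi = 12 + arf_active_or_kf + (int_lbq / 6 < 4 ? int_lbq / 6 : 4);
  if (lo > hard_cap) lo = hard_cap;
  if (hi < lo)
    hi = lo; /* never allow the active max to fall below the active min */
  else if (hi > hard_cap)
    hi = hard_cap;
  *active_min = lo;
  *active_max = hi;
}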
+ if (active_max_gf_interval < active_min_gf_interval) { active_max_gf_interval = active_min_gf_interval; - else if (active_max_gf_interval > rc->max_gf_interval) - active_max_gf_interval = rc->max_gf_interval; + } else { + active_max_gf_interval = VPXMIN(active_max_gf_interval, + rc->max_gf_interval + arf_active_or_kf); + } // Would the active max drop us out just before the near the next kf? if ((active_max_gf_interval <= rc->frames_to_key) && @@ -2172,7 +2502,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { ++i; // Accumulate error score of frames in this gf group. - mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); + mod_frame_err = + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); gf_group_err += mod_frame_err; gf_group_raw_error += this_frame->coded_error; gf_group_noise += this_frame->frame_noise_energy; @@ -2197,8 +2528,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { last_loop_decay_rate = loop_decay_rate; loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); - decay_accumulator = decay_accumulator * loop_decay_rate; - // Monitor for static sections. zero_motion_accumulator = VPXMIN( zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); @@ -2210,12 +2539,16 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { allow_alt_ref = 0; break; } - } - // Calculate a boost number for this frame. - boost_score += - decay_accumulator * - calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST); + // Update the accumulator for second ref error difference. + // This is intended to give an indication of how much the coded error is + // increasing over time. + if (i == 1) { + sr_accumulator += next_frame.coded_error; + } else { + sr_accumulator += (next_frame.sr_coded_error - next_frame.coded_error); + } + } // Break out conditions. if ( @@ -2228,15 +2561,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { ((rc->frames_to_key - i) >= rc->min_gf_interval) && (!flash_detected) && ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || - (abs_mv_in_out_accumulator > 3.0) || - (mv_in_out_accumulator < -2.0) || - ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) { - boost_score = old_boost_score; + (abs_mv_in_out_accumulator > abs_mv_in_out_thresh) || + (sr_accumulator > next_frame.intra_error)))) { break; } *this_frame = next_frame; - old_boost_score = boost_score; } // Was the group length constrained by the requirement for a new KF? @@ -2245,9 +2575,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Should we use the alternate reference frame. if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) { + const int forward_frames = (rc->frames_to_key - i >= i - 1) + ? i - 1 + : VPXMAX(0, rc->frames_to_key - i); + // Calculate the boost for alt ref. - rc->gfu_boost = - calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); + rc->gfu_boost = calc_arf_boost(cpi, forward_frames, (i - 1)); rc->source_alt_ref_pending = 1; // Test to see if multi arf is appropriate. @@ -2257,10 +2590,17 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { ? 1 : 0; } else { - rc->gfu_boost = VPXMAX((int)boost_score, MIN_ARF_GF_BOOST); + rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1)); rc->source_alt_ref_pending = 0; } +#ifdef AGGRESSIVE_VBR + // Limit maximum boost based on interval length. 
+ rc->gfu_boost = VPXMIN((int)rc->gfu_boost, i * 140); +#else + rc->gfu_boost = VPXMIN((int)rc->gfu_boost, i * 200); +#endif + // Set the interval until the next gf. rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending); @@ -2273,7 +2613,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int j; for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) { if (EOF == input_stats(twopass, this_frame)) break; - gf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); + gf_group_err += + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); gf_group_raw_error += this_frame->coded_error; gf_group_noise += this_frame->frame_noise_energy; gf_group_skip_pct += this_frame->intra_skip_pct; @@ -2293,7 +2634,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err); // Calculate an estimate of the maxq needed for the group. - // We are more agressive about correcting for sections + // We are more aggressive about correcting for sections // where there could be significant overshoot than for easier // sections where we do not wish to risk creating an overshoot // of the allocated bit budget. @@ -2312,6 +2653,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { group_av_noise, vbr_group_bits_per_frame); twopass->active_worst_quality = (tmp_q + (twopass->active_worst_quality * 3)) >> 2; + +#if CONFIG_ALWAYS_ADJUST_BPM + // Reset rolling actual and target bits counters for ARF groups. + twopass->rolling_arf_group_target_bits = 0; + twopass->rolling_arf_group_actual_bits = 0; +#endif } // Context Adjustment of ARNR filter strength @@ -2328,7 +2675,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { gf_group_bits); // Adjust KF group bits and error remaining. - twopass->kf_group_error_left -= (int64_t)gf_group_err; + twopass->kf_group_error_left -= gf_group_err; // Allocate bits to each of the frames in the GF group. allocate_gf_group_bits(cpi, gf_group_bits, gf_arf_bits); @@ -2346,10 +2693,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Default to starting GF groups at normal frame size. cpi->rc.next_frame_size_selector = UNSCALED; } - +#if !CONFIG_ALWAYS_ADJUST_BPM // Reset rolling actual and target bits counters for ARF groups. twopass->rolling_arf_group_target_bits = 0; twopass->rolling_arf_group_actual_bits = 0; +#endif } // Threshold for use of the lagging second reference frame. High second ref @@ -2377,6 +2725,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // ratio in the next frame. #define II_IMPROVEMENT_THRESHOLD 3.5 #define KF_II_MAX 128.0 +#define II_FACTOR 12.5 +// Test for very low intra complexity which could cause false key frames +#define V_LOW_INTRA 0.5 static int test_candidate_kf(TWO_PASS *twopass, const FIRSTPASS_STATS *last_frame, @@ -2416,7 +2767,7 @@ static int test_candidate_kf(TWO_PASS *twopass, // Examine how well the key frame predicts subsequent frames. 
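For reference, a standalone version of the intra/inter ("II") ratio computed inside the loop that follows, using the II_FACTOR and KF_II_MAX values defined above; the epsilon guard only approximates DOUBLE_DIVIDE_CHECK() and the function name is illustrative. A large ratio means inter prediction from the candidate key frame stays much cheaper than intra coding, so the candidate keeps accumulating boost; the loop breaks out once the ratio and the boost gains fall away, or when intra_error drops below V_LOW_INTRA.

static double next_ii_ratio(double intra_error, double coded_error) {
  const double kIIFactor = 12.5; /* II_FACTOR above */
  const double kIIMax = 128.0;   /* KF_II_MAX above */
  const double ratio = kIIFactor * intra_error / (coded_error + 0.000001);
  return ratio > kIIMax ? kIIMax : ratio;
}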
for (i = 0; i < 16; ++i) { - double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error / + double next_iiratio = (II_FACTOR * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)); if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX; @@ -2436,7 +2787,7 @@ static int test_candidate_kf(TWO_PASS *twopass, 0.20) && (next_iiratio < 3.0)) || ((boost_score - old_boost_score) < 3.0) || - (local_next_frame.intra_error < 200)) { + (local_next_frame.intra_error < V_LOW_INTRA)) { break; } @@ -2462,6 +2813,16 @@ static int test_candidate_kf(TWO_PASS *twopass, } #define FRAMES_TO_CHECK_DECAY 8 +#define MIN_KF_TOT_BOOST 300 +#define KF_BOOST_SCAN_MAX_FRAMES 32 + +#ifdef AGGRESSIVE_VBR +#define KF_MAX_FRAME_BOOST 80.0 +#define MAX_KF_TOT_BOOST 4800 +#else +#define KF_MAX_FRAME_BOOST 96.0 +#define MAX_KF_TOT_BOOST 5400 +#endif static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int i, j; @@ -2474,15 +2835,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { FIRSTPASS_STATS next_frame; FIRSTPASS_STATS last_frame; int kf_bits = 0; - int loop_decay_counter = 0; double decay_accumulator = 1.0; - double av_decay_accumulator = 0.0; double zero_motion_accumulator = 1.0; double boost_score = 0.0; double kf_mod_err = 0.0; double kf_group_err = 0.0; double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; - + double sr_accumulator = 0.0; + const double av_err = get_distribution_av_err(cpi, twopass); vp9_zero(next_frame); cpi->common.frame_type = KEY_FRAME; @@ -2503,10 +2863,11 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->frames_to_key = 1; - twopass->kf_group_bits = 0; // Total bits available to kf group - twopass->kf_group_error_left = 0; // Group modified error score. + twopass->kf_group_bits = 0; // Total bits available to kf group + twopass->kf_group_error_left = 0.0; // Group modified error score. - kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); + kf_mod_err = + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); // Initialize the decay rates for the recent frames to check for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0; @@ -2516,7 +2877,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { while (twopass->stats_in < twopass->stats_in_end && rc->frames_to_key < cpi->oxcf.key_freq) { // Accumulate kf group error. - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); + kf_group_err += + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); // Load the next frame's stats. last_frame = *this_frame; @@ -2576,7 +2938,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Rescan to get the correct error data for the forced kf group. 
for (i = 0; i < rc->frames_to_key; ++i) { - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame); + kf_group_err += + calculate_norm_frame_score(cpi, twopass, oxcf, &tmp_frame, av_err); input_stats(twopass, &tmp_frame); } rc->next_key_frame_forced = 1; @@ -2593,7 +2956,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int j; for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) { if (EOF == input_stats(twopass, this_frame)) break; - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); + kf_group_err += + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); } rc->frames_to_key = new_frame_to_key; } @@ -2601,11 +2965,12 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Special case for the last key frame of the file. if (twopass->stats_in >= twopass->stats_in_end) { // Accumulate kf group error. - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); + kf_group_err += + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); } // Calculate the number of bits that should be assigned to the kf group. - if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) { + if (twopass->bits_left > 0 && twopass->normalized_score_left > 0.0) { // Maximum number of bits for a single normal frame (not key frame). const int max_bits = frame_max_bits(rc, &cpi->oxcf); @@ -2615,7 +2980,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Default allocation based on bits left and relative // complexity of the section. twopass->kf_group_bits = (int64_t)( - twopass->bits_left * (kf_group_err / twopass->modified_error_left)); + twopass->bits_left * (kf_group_err / twopass->normalized_score_left)); // Clip based on maximum per frame rate defined by the user. max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key; @@ -2631,34 +2996,36 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Scan through the kf group collating various stats used to determine // how many bits to spend on it. - decay_accumulator = 1.0; boost_score = 0.0; + for (i = 0; i < (rc->frames_to_key - 1); ++i) { if (EOF == input_stats(twopass, &next_frame)) break; - // Monitor for static sections. - zero_motion_accumulator = VPXMIN(zero_motion_accumulator, - get_zero_motion_factor(cpi, &next_frame)); + if (i <= KF_BOOST_SCAN_MAX_FRAMES) { + double frame_boost; + double zm_factor; - // Not all frames in the group are necessarily used in calculating boost. - if ((i <= rc->max_gf_interval) || - ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) { - const double frame_boost = - calc_frame_boost(cpi, &next_frame, 0, KF_MAX_BOOST); + // Monitor for static sections. + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); - // How fast is prediction quality decaying. - if (!detect_flash(twopass, 0)) { - const double loop_decay_rate = - get_prediction_decay_rate(cpi, &next_frame); - decay_accumulator *= loop_decay_rate; - decay_accumulator = VPXMAX(decay_accumulator, MIN_DECAY_FACTOR); - av_decay_accumulator += decay_accumulator; - ++loop_decay_counter; - } - boost_score += (decay_accumulator * frame_boost); + // Factor 0.75-1.25 based on how much of frame is static. 
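A one-line helper makes the range of that factor explicit (the name is illustrative and not part of the patch): zero_motion_accumulator lies in [0.0, 1.0], so the factor spans [0.75, 1.25]; with the non-AGGRESSIVE_VBR KF_MAX_FRAME_BOOST of 96.0, the per-frame cap passed to calc_kf_frame_boost() below therefore ranges from 72.0 to 120.0.

static double kf_zero_motion_factor(double zero_motion_accumulator) {
  /* Fully static section -> 1.25, fully moving section -> 0.75. */
  return 0.75 + zero_motion_accumulator / 2.0;
}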
+ zm_factor = (0.75 + (zero_motion_accumulator / 2.0)); + + // The second (lagging) ref error is not valid immediately after + // a key frame because either the lag has not built up (in the case of + // the first key frame or it points to a refernce before the new key + // frame. + if (i < 2) sr_accumulator = 0.0; + frame_boost = calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0, + KF_MAX_FRAME_BOOST * zm_factor); + + boost_score += frame_boost; + if (frame_boost < 25.00) break; + } else { + break; } } - av_decay_accumulator /= (double)loop_decay_counter; reset_fpf_position(twopass, start_position); @@ -2670,9 +3037,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { start_position, twopass->stats_in_end, rc->frames_to_key); // Apply various clamps for min and max boost - rc->kf_boost = (int)(av_decay_accumulator * boost_score); - rc->kf_boost = VPXMAX(rc->kf_boost, (rc->frames_to_key * 3)); - rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_BOOST); + rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); + rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); + rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); // Work out how many bits to allocate for the key frame itself. kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, @@ -2686,12 +3053,12 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { gf_group->rf_level[0] = KF_STD; // Note the total error score of the kf group minus the key frame itself. - twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err); + twopass->kf_group_error_left = (kf_group_err - kf_mod_err); // Adjust the count of total modified error left. // The count of bits left is adjusted elsewhere based on real coded frame // sizes. - twopass->modified_error_left -= kf_group_err; + twopass->normalized_score_left -= kf_group_err; if (oxcf->resize_mode == RESIZE_DYNAMIC) { // Default to normal-sized frame on keyframes. @@ -2923,16 +3290,10 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { target_rate = gf_group->bit_allocation[gf_group->index]; rc->base_frame_target = target_rate; - { - const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) - ? cpi->initial_mbs - : cpi->common.MBs; - // The multiplication by 256 reverses a scaling factor of (>> 8) - // applied when combining MB error values for the frame. - twopass->mb_av_energy = - log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0); - twopass->mb_smooth_pct = this_frame.intra_smooth_pct; - } + // The multiplication by 256 reverses a scaling factor of (>> 8) + // applied when combining MB error values for the frame. + twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0); + twopass->mb_smooth_pct = this_frame.intra_smooth_pct; // Update the total stats remaining structure. 
subtract_stats(&twopass->total_left_stats, &this_frame); diff --git a/libs/libvpx/vp9/encoder/vp9_firstpass.h b/libs/libvpx/vp9/encoder/vp9_firstpass.h index 6aa39cdc00..000ecd7792 100644 --- a/libs/libvpx/vp9/encoder/vp9_firstpass.h +++ b/libs/libvpx/vp9/encoder/vp9_firstpass.h @@ -39,6 +39,39 @@ typedef struct { } FIRSTPASS_MB_STATS; #endif +#define INVALID_ROW -1 + +typedef struct { + double frame_mb_intra_factor; + double frame_mb_brightness_factor; + double frame_mb_neutral_count; +} FP_MB_FLOAT_STATS; + +typedef struct { + double intra_factor; + double brightness_factor; + int64_t coded_error; + int64_t sr_coded_error; + int64_t frame_noise_energy; + int64_t intra_error; + int intercount; + int second_ref_count; + double neutral_count; + double intra_count_low; // Coded intra but low variance + double intra_count_high; // Coded intra high variance + int intra_skip_count; + int image_data_start_row; + int mvcount; + int sum_mvr; + int sum_mvr_abs; + int sum_mvc; + int sum_mvc_abs; + int64_t sum_mvrs; + int64_t sum_mvcs; + int sum_in_vectors; + int intra_smooth_count; +} FIRSTPASS_DATA; + typedef struct { double frame; double weight; @@ -50,6 +83,8 @@ typedef struct { double pcnt_motion; double pcnt_second_ref; double pcnt_neutral; + double pcnt_intra_low; // Coded intra but low variance + double pcnt_intra_high; // Coded intra high variance double intra_skip_pct; double intra_smooth_pct; // % of blocks that are smooth double inactive_zone_rows; // Image mask rows top and bottom. @@ -61,7 +96,6 @@ typedef struct { double MVrv; double MVcv; double mv_in_out_count; - double new_mv_count; double duration; double count; int64_t spatial_layer_id; @@ -104,9 +138,8 @@ typedef struct { FIRSTPASS_STATS total_left_stats; int first_pass_done; int64_t bits_left; - double modified_error_min; - double modified_error_max; - double modified_error_left; + double mean_mod_score; + double normalized_score_left; double mb_av_energy; double mb_smooth_pct; @@ -115,6 +148,9 @@ typedef struct { uint8_t *this_frame_mb_stats; FIRSTPASS_MB_STATS firstpass_mb_stats; #endif + + FP_MB_FLOAT_STATS *fp_mb_float_stats; + // An indication of the content type of the current frame FRAME_CONTENT_TYPE fr_content_type; @@ -122,7 +158,7 @@ typedef struct { int64_t kf_group_bits; // Error score of frames still to be coded in kf group - int64_t kf_group_error_left; + double kf_group_error_left; double bpm_factor; int rolling_arf_group_target_bits; @@ -142,12 +178,20 @@ typedef struct { } TWO_PASS; struct VP9_COMP; +struct ThreadData; +struct TileDataEnc; void vp9_init_first_pass(struct VP9_COMP *cpi); void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi); void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source); void vp9_end_first_pass(struct VP9_COMP *cpi); +void vp9_first_pass_encode_tile_mb_row(struct VP9_COMP *cpi, + struct ThreadData *td, + FIRSTPASS_DATA *fp_acc_data, + struct TileDataEnc *tile_data, + MV *best_ref_mv, int mb_row); + void vp9_init_second_pass(struct VP9_COMP *cpi); void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi); void vp9_twopass_postencode_update(struct VP9_COMP *cpi); diff --git a/libs/libvpx/vp9/encoder/vp9_frame_scale.c b/libs/libvpx/vp9/encoder/vp9_frame_scale.c new file mode 100644 index 0000000000..a410d0407f --- /dev/null +++ b/libs/libvpx/vp9/encoder/vp9_frame_scale.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "vp9/common/vp9_blockd.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_scale/yv12config.h" + +void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + INTERP_FILTER filter_type, int phase_scaler) { + const int src_w = src->y_crop_width; + const int src_h = src->y_crop_height; + const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, + src->v_buffer }; + const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; + uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; + const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; + const InterpKernel *const kernel = vp9_filter_kernels[filter_type]; + int x, y, i; + +#if HAVE_SSSE3 || HAVE_NEON + // TODO(linfengz): The 4:3 specialized C code is disabled by default since + // it's much slower than the general version which calls vpx_scaled_2d() even + // if vpx_scaled_2d() is not optimized. It will only be enabled as a reference + // for the platforms which have faster optimization. + if (4 * dst->y_crop_width == 3 * src_w && + 4 * dst->y_crop_height == 3 * src_h) { + // Specialize 4 to 3 scaling. + // Example pixel locations. + // (O: Original pixel. S: Scaled pixel. X: Overlapped pixel.) + // phase_scaler = 0 | phase_scaler = 8 + // | + // X O S O S O X | O O O O O + // | + // | + // | S S S + // | + // | + // O O O O O | O O O O O + // | + // S S S S | + // | + // | + // | S S S + // O O O O O | O O O O O + // | + // | + // | + // S S S S | + // | + // O O O O O | O O O O O + // | S S S + // | + // | + // | + // | + // X O S O S O X | O O O O O + + const int dst_ws[3] = { dst->y_crop_width, dst->uv_crop_width, + dst->uv_crop_width }; + const int dst_hs[3] = { dst->y_crop_height, dst->uv_crop_height, + dst->uv_crop_height }; + for (i = 0; i < MAX_MB_PLANE; ++i) { + const int dst_w = dst_ws[i]; + const int dst_h = dst_hs[i]; + const int src_stride = src_strides[i]; + const int dst_stride = dst_strides[i]; + for (y = 0; y < dst_h; y += 3) { + for (x = 0; x < dst_w; x += 3) { + const uint8_t *src_ptr = srcs[i] + 4 * y / 3 * src_stride + 4 * x / 3; + uint8_t *dst_ptr = dsts[i] + y * dst_stride + x; + + // Must call c function because its optimization doesn't support 3x3. + vpx_scaled_2d_c(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + phase_scaler, 64 / 3, phase_scaler, 64 / 3, 3, 3); + } + } + } + } else +#endif + { + const int dst_w = dst->y_crop_width; + const int dst_h = dst->y_crop_height; + for (i = 0; i < MAX_MB_PLANE; ++i) { + const int factor = (i == 0 || i == 3 ? 
1 : 2); + const int src_stride = src_strides[i]; + const int dst_stride = dst_strides[i]; + for (y = 0; y < dst_h; y += 16) { + const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler; + for (x = 0; x < dst_w; x += 16) { + const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler; + const uint8_t *src_ptr = srcs[i] + + (y / factor) * src_h / dst_h * src_stride + + (x / factor) * src_w / dst_w; + uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); + + vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor); + } + } + } + } + + vpx_extend_frame_borders(dst); +} diff --git a/libs/libvpx/vp9/encoder/vp9_job_queue.h b/libs/libvpx/vp9/encoder/vp9_job_queue.h new file mode 100644 index 0000000000..89c08f207a --- /dev/null +++ b/libs/libvpx/vp9/encoder/vp9_job_queue.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_ENCODER_VP9_JOB_QUEUE_H_ +#define VP9_ENCODER_VP9_JOB_QUEUE_H_ + +typedef enum { + FIRST_PASS_JOB, + ENCODE_JOB, + ARNR_JOB, + NUM_JOB_TYPES, +} JOB_TYPE; + +// Encode job parameters +typedef struct { + int vert_unit_row_num; // Index of the vertical unit row + int tile_col_id; // tile col id within a tile + int tile_row_id; // tile col id within a tile +} JobNode; + +// Job queue element parameters +typedef struct { + // Pointer to the next link in the job queue + void *next; + + // Job information context of the module + JobNode job_info; +} JobQueue; + +// Job queue handle +typedef struct { + // Pointer to the next link in the job queue + void *next; + + // Counter to store the number of jobs picked up for processing + int num_jobs_acquired; +} JobQueueHandle; + +#endif // VP9_ENCODER_VP9_JOB_QUEUE_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_mbgraph.c b/libs/libvpx/vp9/encoder/vp9_mbgraph.c index e000220b97..46d626def1 100644 --- a/libs/libvpx/vp9/encoder/vp9_mbgraph.c +++ b/libs/libvpx/vp9/encoder/vp9_mbgraph.c @@ -45,10 +45,13 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, const MV *ref_mv, mv_sf->search_method = HEX; vp9_full_pixel_search(cpi, x, BLOCK_16X16, &ref_full, step_param, - x->errorperbit, cond_cost_list(cpi, cost_list), ref_mv, - dst_mv, 0, 0); + cpi->sf.mv.search_method, x->errorperbit, + cond_cost_list(cpi, cost_list), ref_mv, dst_mv, 0, 0); mv_sf->search_method = old_search_method; + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + // Try sub-pixel MC // if (bestsme > error_thresh && bestsme < INT_MAX) { @@ -66,9 +69,6 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, const MV *ref_mv, vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16); - /* restore UMV window */ - x->mv_limits = tmp_mv_limits; - return vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, xd->plane[0].dst.stride); } diff --git a/libs/libvpx/vp9/encoder/vp9_mcomp.c b/libs/libvpx/vp9/encoder/vp9_mcomp.c index 2d9bcbda67..44f01be25a 100644 --- a/libs/libvpx/vp9/encoder/vp9_mcomp.c +++ b/libs/libvpx/vp9/encoder/vp9_mcomp.c @@ -21,6 +21,7 @@ #include "vpx_ports/mem.h" #include 
"vp9/common/vp9_common.h" +#include "vp9/common/vp9_mvref_common.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/encoder/vp9_encoder.h" @@ -52,6 +53,24 @@ void vp9_set_mv_search_range(MvLimits *mv_limits, const MV *mv) { if (mv_limits->row_max > row_max) mv_limits->row_max = row_max; } +void vp9_set_subpel_mv_search_range(MvLimits *subpel_mv_limits, + const MvLimits *umv_window_limits, + const MV *ref_mv) { + subpel_mv_limits->col_min = VPXMAX(umv_window_limits->col_min * 8, + ref_mv->col - MAX_FULL_PEL_VAL * 8); + subpel_mv_limits->col_max = VPXMIN(umv_window_limits->col_max * 8, + ref_mv->col + MAX_FULL_PEL_VAL * 8); + subpel_mv_limits->row_min = VPXMAX(umv_window_limits->row_min * 8, + ref_mv->row - MAX_FULL_PEL_VAL * 8); + subpel_mv_limits->row_max = VPXMIN(umv_window_limits->row_max * 8, + ref_mv->row + MAX_FULL_PEL_VAL * 8); + + subpel_mv_limits->col_min = VPXMAX(MV_LOW + 1, subpel_mv_limits->col_min); + subpel_mv_limits->col_max = VPXMIN(MV_UPP - 1, subpel_mv_limits->col_max); + subpel_mv_limits->row_min = VPXMAX(MV_LOW + 1, subpel_mv_limits->row_min); + subpel_mv_limits->row_max = VPXMIN(MV_UPP - 1, subpel_mv_limits->row_max); +} + int vp9_init_search_range(int size) { int sr = 0; // Minimum search size no matter what the passed in value. @@ -82,10 +101,8 @@ static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost, int *mvcost[2], int error_per_bit) { if (mvcost) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; - // This product sits at a 32-bit ceiling right now and any additional - // accuracy in either bit cost or error cost will cause it to overflow. - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(&diff, mvjcost, mvcost) * error_per_bit, + return (int)ROUND64_POWER_OF_TWO( + (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit, RDDIV_BITS + VP9_PROB_COST_SHIFT - RD_EPB_SHIFT + PIXEL_TRANSFORM_ERROR_SCALE); } @@ -138,17 +155,6 @@ void vp9_init3smotion_compensation(search_site_config *cfg, int stride) { cfg->total_steps = ss_count / cfg->searches_per_step; } -/* Estimated (square) error cost of a motion vector (r,c). The 14 scale comes - * from the same math as in mv_err_cost(). */ -#define MVC(r, c) \ - (mvcost \ - ? 
((unsigned)(mvjcost[((r) != rr) * 2 + ((c) != rc)] + \ - mvcost[0][((r)-rr)] + mvcost[1][((c)-rc)]) * \ - error_per_bit + \ - 8192) >> \ - 14 \ - : 0) - // convert motion vector component to offset for sv[a]f calc static INLINE int sp(int x) { return x & 7; } @@ -161,6 +167,8 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { #define CHECK_BETTER(v, r, c) \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ int64_t tmpmse; \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ if (second_pred == NULL) { \ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse); \ @@ -169,7 +177,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { src_stride, &sse, second_pred); \ } \ tmpmse = thismse; \ - tmpmse += MVC(r, c); \ + tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ if (tmpmse >= INT_MAX) { \ v = INT_MAX; \ } else if ((v = (uint32_t)tmpmse) < besterr) { \ @@ -186,13 +194,16 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { /* checks if (r, c) has better score than previous best */ #define CHECK_BETTER(v, r, c) \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ if (second_pred == NULL) \ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse); \ else \ thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse, second_pred); \ - if ((v = MVC(r, c) + thismse) < besterr) { \ + if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ + thismse) < besterr) { \ besterr = v; \ br = r; \ bc = c; \ @@ -273,34 +284,38 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { } \ } -#define SETUP_SUBPEL_SEARCH \ - const uint8_t *const z = x->plane[0].src.buf; \ - const int src_stride = x->plane[0].src.stride; \ - const MACROBLOCKD *xd = &x->e_mbd; \ - unsigned int besterr = INT_MAX; \ - unsigned int sse; \ - unsigned int whichdir; \ - int thismse; \ - const unsigned int halfiters = iters_per_step; \ - const unsigned int quarteriters = iters_per_step; \ - const unsigned int eighthiters = iters_per_step; \ - const int y_stride = xd->plane[0].pre[0].stride; \ - const int offset = bestmv->row * y_stride + bestmv->col; \ - const uint8_t *const y = xd->plane[0].pre[0].buf; \ - \ - int rr = ref_mv->row; \ - int rc = ref_mv->col; \ - int br = bestmv->row * 8; \ - int bc = bestmv->col * 8; \ - int hstep = 4; \ - const int minc = VPXMAX(x->mv_limits.col_min * 8, ref_mv->col - MV_MAX); \ - const int maxc = VPXMIN(x->mv_limits.col_max * 8, ref_mv->col + MV_MAX); \ - const int minr = VPXMAX(x->mv_limits.row_min * 8, ref_mv->row - MV_MAX); \ - const int maxr = VPXMIN(x->mv_limits.row_max * 8, ref_mv->row + MV_MAX); \ - int tr = br; \ - int tc = bc; \ - \ - bestmv->row *= 8; \ +#define SETUP_SUBPEL_SEARCH \ + const uint8_t *const z = x->plane[0].src.buf; \ + const int src_stride = x->plane[0].src.stride; \ + const MACROBLOCKD *xd = &x->e_mbd; \ + unsigned int besterr = UINT_MAX; \ + unsigned int sse; \ + unsigned int whichdir; \ + int thismse; \ + const unsigned int halfiters = iters_per_step; \ + const unsigned int quarteriters = iters_per_step; \ + const unsigned int eighthiters = iters_per_step; \ + const int y_stride = xd->plane[0].pre[0].stride; \ + const int offset = bestmv->row * y_stride + bestmv->col; \ + const uint8_t *const y = xd->plane[0].pre[0].buf; \ + \ + int rr 
= ref_mv->row; \ + int rc = ref_mv->col; \ + int br = bestmv->row * 8; \ + int bc = bestmv->col * 8; \ + int hstep = 4; \ + int minc, maxc, minr, maxr; \ + int tr = br; \ + int tc = bc; \ + MvLimits subpel_mv_limits; \ + \ + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); \ + minc = subpel_mv_limits.col_min; \ + maxc = subpel_mv_limits.col_max; \ + minr = subpel_mv_limits.row_min; \ + maxr = subpel_mv_limits.row_max; \ + \ + bestmv->row *= 8; \ bestmv->col *= 8; static unsigned int setup_center_error( @@ -346,7 +361,7 @@ static unsigned int setup_center_error( #endif // CONFIG_VP9_HIGHBITDEPTH } -static INLINE int divide_and_round(const int n, const int d) { +static INLINE int64_t divide_and_round(const int64_t n, const int64_t d) { return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d); } @@ -364,10 +379,13 @@ static INLINE int is_cost_list_wellbehaved(int *cost_list) { // y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0). // The code below is an integerized version of that. static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) { - *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)), - (cost_list[1] - 2 * cost_list[0] + cost_list[3])); - *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)), - (cost_list[4] - 2 * cost_list[0] + cost_list[2])); + const int64_t x0 = (int64_t)cost_list[1] - cost_list[3]; + const int64_t y0 = cost_list[1] - 2 * (int64_t)cost_list[0] + cost_list[3]; + const int64_t x1 = (int64_t)cost_list[4] - cost_list[2]; + const int64_t y1 = cost_list[4] - 2 * (int64_t)cost_list[0] + cost_list[2]; + const int b = 1 << (bits - 1); + *ic = (int)divide_and_round(x0 * b, y0); + *ir = (int)divide_and_round(x1 * b, y1); } uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, @@ -401,10 +419,6 @@ uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, (void)thismse; (void)cost_list; - if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return UINT_MAX; - return besterr; } @@ -430,7 +444,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) { int ir, ic; - unsigned int minpt; + unsigned int minpt = INT_MAX; get_cost_surf_min(cost_list, &ir, &ic, 2); if (ir != 0 || ic != 0) { CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic); @@ -470,10 +484,6 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( bestmv->row = br; bestmv->col = bc; - if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return INT_MAX; - return besterr; } @@ -534,10 +544,6 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_more( bestmv->row = br; bestmv->col = bc; - if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return UINT_MAX; - return besterr; } @@ -620,10 +626,6 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned( bestmv->row = br; bestmv->col = bc; - if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return INT_MAX; - return besterr; } @@ -646,7 +648,7 @@ uint32_t vp9_find_best_sub_pixel_tree( const uint8_t *const src_address = z; const int src_stride = x->plane[0].src.stride; const MACROBLOCKD *xd = &x->e_mbd; - unsigned int besterr = INT_MAX; + unsigned int besterr = UINT_MAX; 
unsigned int sse; int thismse; const int y_stride = xd->plane[0].pre[0].stride; @@ -659,16 +661,21 @@ uint32_t vp9_find_best_sub_pixel_tree( int bc = bestmv->col * 8; int hstep = 4; int iter, round = 3 - forced_stop; - const int minc = VPXMAX(x->mv_limits.col_min * 8, ref_mv->col - MV_MAX); - const int maxc = VPXMIN(x->mv_limits.col_max * 8, ref_mv->col + MV_MAX); - const int minr = VPXMAX(x->mv_limits.row_min * 8, ref_mv->row - MV_MAX); - const int maxr = VPXMIN(x->mv_limits.row_max * 8, ref_mv->row + MV_MAX); + + int minc, maxc, minr, maxr; int tr = br; int tc = bc; const MV *search_step = search_step_table; int idx, best_idx = -1; unsigned int cost_array[5]; int kr, kc; + MvLimits subpel_mv_limits; + + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); + minc = subpel_mv_limits.col_min; + maxc = subpel_mv_limits.col_max; + minr = subpel_mv_limits.row_min; + maxr = subpel_mv_limits.row_max; if (!(allow_hp && use_mv_hp(ref_mv))) if (round == 3) round = 2; @@ -708,7 +715,7 @@ uint32_t vp9_find_best_sub_pixel_tree( *sse1 = sse; } } else { - cost_array[idx] = INT_MAX; + cost_array[idx] = UINT_MAX; } } @@ -737,7 +744,7 @@ uint32_t vp9_find_best_sub_pixel_tree( *sse1 = sse; } } else { - cost_array[idx] = INT_MAX; + cost_array[idx] = UINT_MAX; } if (best_idx < 4 && best_idx >= 0) { @@ -769,14 +776,9 @@ uint32_t vp9_find_best_sub_pixel_tree( bestmv->row = br; bestmv->col = bc; - if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return INT_MAX; - return besterr; } -#undef MVC #undef CHECK_BETTER static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col, @@ -858,13 +860,12 @@ static INLINE void calc_int_cost_list(const MACROBLOCK *x, const MV *ref_mv, // candidates as indicated in the num_candidates and candidates arrays // passed into this function // -static int vp9_pattern_search(const MACROBLOCK *x, MV *ref_mv, int search_param, - int sad_per_bit, int do_init_search, - int *cost_list, const vp9_variance_fn_ptr_t *vfp, - int use_mvcost, const MV *center_mv, MV *best_mv, - const int num_candidates[MAX_PATTERN_SCALES], - const MV candidates[MAX_PATTERN_SCALES] - [MAX_PATTERN_CANDIDATES]) { +static int vp9_pattern_search( + const MACROBLOCK *x, MV *ref_mv, int search_param, int sad_per_bit, + int do_init_search, int *cost_list, const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, const MV *center_mv, MV *best_mv, + const int num_candidates[MAX_PATTERN_SCALES], + const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES]) { const MACROBLOCKD *const xd = &x->e_mbd; static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = { 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, @@ -1822,7 +1823,9 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, } #if CONFIG_VP9_HIGHBITDEPTH - { + // TODO(jingning): Implement integral projection functions for high bit-depth + // setting and remove this part of code. + if (xd->bd != 8) { unsigned int this_sad; tmp_mv->row = 0; tmp_mv->col = 0; @@ -1998,9 +2001,6 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, int range = sf->mesh_patterns[0].range; int baseline_interval_divisor; - // Keep track of number of exhaustive calls (this frame in this thread). - ++(*x->ex_search_count_ptr); - // Trap illegal values for interval and range for this function. 
if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) || (interval > range)) @@ -2042,197 +2042,6 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, return bestsme; } -int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r, c; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - for (c = col_min; c < col_max; ++c) { - const MV mv = { r, c }; - const int sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), - in_what->stride) + - mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - } - return best_sad; -} - -int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - int c = col_min; - const uint8_t *check_here = &in_what->buf[r * in_what->stride + c]; - - if (fn_ptr->sdx3f != NULL) { - while ((c + 2) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[3]); - - fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 3; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - while (c < col_max) { - unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride); - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - - return best_sad; -} - -int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r; - const MACROBLOCKD *const xd = 
&x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - int c = col_min; - const uint8_t *check_here = &in_what->buf[r * in_what->stride + c]; - - if (fn_ptr->sdx8f != NULL) { - while ((c + 7) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[8]); - - fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 8; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - if (fn_ptr->sdx3f != NULL) { - while ((c + 2) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[3]); - - fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 3; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - while (c < col_max) { - unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride); - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - - return best_sad; -} - int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, const vp9_variance_fn_ptr_t *fn_ptr, @@ -2318,11 +2127,14 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, const struct buf_2d *const what = &x->plane[0].src; const struct buf_2d *const in_what = &xd->plane[0].pre[0]; const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned int best_sad = + unsigned int best_sad = INT_MAX; + int i, j; + clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + best_sad = fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), in_what->stride, second_pred) + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); - int i, j; for (i = 0; i < search_range; ++i) { int best_site = -1; @@ -2355,24 +2167,12 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, return best_sad; } -#define MIN_EX_SEARCH_LIMIT 128 -static int is_exhaustive_allowed(VP9_COMP *cpi, MACROBLOCK *x) { - const SPEED_FEATURES *const sf = &cpi->sf; - const int max_ex = - VPXMAX(MIN_EX_SEARCH_LIMIT, - (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100); - - return sf->allow_exhaustive_searches && - (sf->exhaustive_searches_thresh < INT_MAX) && - (*x->ex_search_count_ptr <= max_ex) && !cpi->rc.is_src_frame_alt_ref; -} - int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, 
BLOCK_SIZE bsize, - MV *mvp_full, int step_param, int error_per_bit, - int *cost_list, const MV *ref_mv, MV *tmp_mv, - int var_max, int rd) { + MV *mvp_full, int step_param, int search_method, + int error_per_bit, int *cost_list, const MV *ref_mv, + MV *tmp_mv, int var_max, int rd) { const SPEED_FEATURES *const sf = &cpi->sf; - const SEARCH_METHODS method = sf->mv.search_method; + const SEARCH_METHODS method = (SEARCH_METHODS)search_method; vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; int var = 0; if (cost_list) { @@ -2383,9 +2183,6 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, cost_list[4] = INT_MAX; } - // Keep track of number of searches (this frame in this thread). - ++(*x->m_search_count_ptr); - switch (method) { case FAST_DIAMOND: var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0, @@ -2413,7 +2210,8 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, cost_list, fn_ptr, ref_mv, tmp_mv); // Should we allow a follow on exhaustive search? - if (is_exhaustive_allowed(cpi, x)) { + if ((sf->exhaustive_searches_thresh < INT_MAX) && + !cpi->rc.is_src_frame_alt_ref) { int64_t exhuastive_thr = sf->exhaustive_searches_thresh; exhuastive_thr >>= 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); @@ -2440,3 +2238,85 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, return var; } + +// Note(yunqingwang): The following 2 functions are only used in the motion +// vector unit test, which return extreme motion vectors allowed by the MV +// limits. +#define COMMON_MV_TEST \ + SETUP_SUBPEL_SEARCH; \ + \ + (void)error_per_bit; \ + (void)vfp; \ + (void)z; \ + (void)src_stride; \ + (void)y; \ + (void)y_stride; \ + (void)second_pred; \ + (void)w; \ + (void)h; \ + (void)offset; \ + (void)mvjcost; \ + (void)mvcost; \ + (void)sse1; \ + (void)distortion; \ + \ + (void)halfiters; \ + (void)quarteriters; \ + (void)eighthiters; \ + (void)whichdir; \ + (void)allow_hp; \ + (void)forced_stop; \ + (void)hstep; \ + (void)rr; \ + (void)rc; \ + \ + (void)tr; \ + (void)tc; \ + (void)sse; \ + (void)thismse; \ + (void)cost_list; + +// Return the maximum MV. +uint32_t vp9_return_max_sub_pixel_mv( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h) { + COMMON_MV_TEST; + + (void)minr; + (void)minc; + + bestmv->row = maxr; + bestmv->col = maxc; + besterr = 0; + + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. + lower_mv_precision(bestmv, allow_hp && use_mv_hp(ref_mv)); + + return besterr; +} +// Return the minimum MV. +uint32_t vp9_return_min_sub_pixel_mv( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h) { + COMMON_MV_TEST; + + (void)maxr; + (void)maxc; + + bestmv->row = minr; + bestmv->col = minc; + besterr = 0; + + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. 
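Motion vectors here are stored in 1/8-pel units; when high-precision MVs are not in use, only even component values (1/4-pel positions) are legal, which is what the lower_mv_precision() call just below enforces. A minimal per-component sketch of the idea; the function name is illustrative and the rounding direction shown is only one reasonable choice, not necessarily the one the real helper uses.

static short clear_hp_bit(short mv_component) {
  if (mv_component & 1)
    mv_component += (mv_component > 0) ? -1 : 1; /* round toward zero */
  return mv_component;
}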
+ lower_mv_precision(bestmv, allow_hp && use_mv_hp(ref_mv)); + + return besterr; +} diff --git a/libs/libvpx/vp9/encoder/vp9_mcomp.h b/libs/libvpx/vp9/encoder/vp9_mcomp.h index 2726b9e230..b8db2c3536 100644 --- a/libs/libvpx/vp9/encoder/vp9_mcomp.h +++ b/libs/libvpx/vp9/encoder/vp9_mcomp.h @@ -81,6 +81,8 @@ extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned; extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_more; extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_evenmore; extern fractional_mv_step_fp vp9_skip_sub_pixel_tree; +extern fractional_mv_step_fp vp9_return_max_sub_pixel_mv; +extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv; typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, @@ -105,9 +107,13 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, struct VP9_COMP; int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - MV *mvp_full, int step_param, int error_per_bit, - int *cost_list, const MV *ref_mv, MV *tmp_mv, - int var_max, int rd); + MV *mvp_full, int step_param, int search_method, + int error_per_bit, int *cost_list, const MV *ref_mv, + MV *tmp_mv, int var_max, int rd); + +void vp9_set_subpel_mv_search_range(MvLimits *subpel_mv_limits, + const MvLimits *umv_window_limits, + const MV *ref_mv); #ifdef __cplusplus } // extern "C" diff --git a/libs/libvpx/vp9/encoder/vp9_multi_thread.c b/libs/libvpx/vp9/encoder/vp9_multi_thread.c new file mode 100644 index 0000000000..da06fb151d --- /dev/null +++ b/libs/libvpx/vp9/encoder/vp9_multi_thread.c @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_multi_thread.h" + +void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt, + int tile_id) { + RowMTInfo *row_mt_info; + JobQueueHandle *job_queue_hdl = NULL; + void *next = NULL; + JobNode *job_info = NULL; +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_handle = NULL; +#endif + + row_mt_info = (RowMTInfo *)(&multi_thread_ctxt->row_mt_info[tile_id]); + job_queue_hdl = (JobQueueHandle *)&row_mt_info->job_queue_hdl; +#if CONFIG_MULTITHREAD + mutex_handle = &row_mt_info->job_mutex; +#endif + +// lock the mutex for queue access +#if CONFIG_MULTITHREAD + pthread_mutex_lock(mutex_handle); +#endif + next = job_queue_hdl->next; + if (NULL != next) { + JobQueue *job_queue = (JobQueue *)next; + job_info = &job_queue->job_info; + // Update the next job in the queue + job_queue_hdl->next = job_queue->next; + job_queue_hdl->num_jobs_acquired++; + } + +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(mutex_handle); +#endif + + return job_info; +} + +void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { + struct VP9Common *cm = &cpi->common; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int tile_row, tile_col; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + int jobs_per_tile_col, total_jobs; + + jobs_per_tile_col = VPXMAX(cm->mb_rows, sb_rows); + // Calculate the total number of jobs + total_jobs = jobs_per_tile_col * tile_cols; + + multi_thread_ctxt->allocated_tile_cols = tile_cols; + multi_thread_ctxt->allocated_tile_rows = tile_rows; + multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col; + + multi_thread_ctxt->job_queue = + (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue)); + +#if CONFIG_MULTITHREAD + // Create mutex for each tile + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col]; + pthread_mutex_init(&row_mt_info->job_mutex, NULL); + } +#endif + + // Allocate memory for row based multi-threading + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_col]; + vp9_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, jobs_per_tile_col); + if (cpi->sf.adaptive_rd_thresh_row_mt) { + const int sb_rows = + (mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2) + 1; + int i; + this_tile->row_base_thresh_freq_fact = + (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES, + sizeof(*(this_tile->row_base_thresh_freq_fact))); + for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++) + this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT; + } + } + + // Assign the sync pointer of tile row zero for every tile row > 0 + for (tile_row = 1; tile_row < tile_rows; tile_row++) { + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + TileDataEnc *this_col_tile = &cpi->tile_data[tile_col]; + this_tile->row_mt_sync = this_col_tile->row_mt_sync; + } + } + + // Calculate the number of vertical units in the given tile row + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols]; + TileInfo *tile_info = &this_tile->tile_info; + multi_thread_ctxt->num_tile_vert_sbs[tile_row] = + get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2); + } +} + +void 
vp9_row_mt_mem_dealloc(VP9_COMP *cpi) { + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int tile_col; +#if CONFIG_MULTITHREAD + int tile_row; +#endif + + // Deallocate memory for job queue + if (multi_thread_ctxt->job_queue) vpx_free(multi_thread_ctxt->job_queue); + +#if CONFIG_MULTITHREAD + // Destroy mutex for each tile + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col]; + if (row_mt_info) pthread_mutex_destroy(&row_mt_info->job_mutex); + } +#endif + + // Free row based multi-threading sync memory + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_col]; + vp9_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync); + } + +#if CONFIG_MULTITHREAD + for (tile_row = 0; tile_row < multi_thread_ctxt->allocated_tile_rows; + tile_row++) { + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + TileDataEnc *this_tile = + &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols + + tile_col]; + if (cpi->sf.adaptive_rd_thresh_row_mt) { + if (this_tile->row_base_thresh_freq_fact != NULL) { + vpx_free(this_tile->row_base_thresh_freq_fact); + this_tile->row_base_thresh_freq_fact = NULL; + } + } + } + } +#endif +} + +void vp9_multi_thread_tile_init(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + int i; + + for (i = 0; i < tile_cols; i++) { + TileDataEnc *this_tile = &cpi->tile_data[i]; + int jobs_per_tile_col = cpi->oxcf.pass == 1 ? cm->mb_rows : sb_rows; + + // Initialize cur_col to -1 for all rows. + memset(this_tile->row_mt_sync.cur_col, -1, + sizeof(*this_tile->row_mt_sync.cur_col) * jobs_per_tile_col); + vp9_zero(this_tile->fp_data); + this_tile->fp_data.image_data_start_row = INVALID_ROW; + } +} + +void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt, + int tile_cols, int num_workers) { + int tile_id = 0; + int i; + + // Allocating the threads for the tiles + for (i = 0; i < num_workers; i++) { + multi_thread_ctxt->thread_id_to_tile_id[i] = tile_id++; + if (tile_id == tile_cols) tile_id = 0; + } +} + +int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt, + int cur_tile_id) { + RowMTInfo *row_mt_info; + JobQueueHandle *job_queue_hndl; +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex; +#endif + int num_jobs_remaining; + + row_mt_info = &multi_thread_ctxt->row_mt_info[cur_tile_id]; + job_queue_hndl = &row_mt_info->job_queue_hdl; +#if CONFIG_MULTITHREAD + mutex = &row_mt_info->job_mutex; +#endif + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(mutex); +#endif + num_jobs_remaining = + multi_thread_ctxt->jobs_per_tile_col - job_queue_hndl->num_jobs_acquired; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(mutex); +#endif + + return (num_jobs_remaining); +} + +void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type) { + VP9_COMMON *const cm = &cpi->common; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + JobQueue *job_queue = multi_thread_ctxt->job_queue; + const int tile_cols = 1 << cm->log2_tile_cols; + int job_row_num, jobs_per_tile, jobs_per_tile_col, total_jobs; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + int tile_col, i; + + jobs_per_tile_col = (job_type != ENCODE_JOB) ? 
cm->mb_rows : sb_rows; + total_jobs = jobs_per_tile_col * tile_cols; + + multi_thread_ctxt->jobs_per_tile_col = jobs_per_tile_col; + // memset the entire job queue buffer to zero + memset(job_queue, 0, total_jobs * sizeof(JobQueue)); + + // Job queue preparation + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + RowMTInfo *tile_ctxt = &multi_thread_ctxt->row_mt_info[tile_col]; + JobQueue *job_queue_curr, *job_queue_temp; + int tile_row = 0; + + tile_ctxt->job_queue_hdl.next = (void *)job_queue; + tile_ctxt->job_queue_hdl.num_jobs_acquired = 0; + + job_queue_curr = job_queue; + job_queue_temp = job_queue; + + // loop over all the vertical rows + for (job_row_num = 0, jobs_per_tile = 0; job_row_num < jobs_per_tile_col; + job_row_num++, jobs_per_tile++) { + job_queue_curr->job_info.vert_unit_row_num = job_row_num; + job_queue_curr->job_info.tile_col_id = tile_col; + job_queue_curr->job_info.tile_row_id = tile_row; + job_queue_curr->next = (void *)(job_queue_temp + 1); + job_queue_curr = ++job_queue_temp; + + if (ENCODE_JOB == job_type) { + if (jobs_per_tile >= + multi_thread_ctxt->num_tile_vert_sbs[tile_row] - 1) { + tile_row++; + jobs_per_tile = -1; + } + } + } + + // Set the last pointer to NULL + job_queue_curr += -1; + job_queue_curr->next = (void *)NULL; + + // Move to the next tile + job_queue += jobs_per_tile_col; + } + + for (i = 0; i < cpi->num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + thread_data->thread_id = i; + + for (tile_col = 0; tile_col < tile_cols; tile_col++) + thread_data->tile_completion_status[tile_col] = 0; + } +} + +int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt, + int *tile_completion_status, int *cur_tile_id, + int tile_cols) { + int tile_col; + int tile_id = -1; // Stores the tile ID with minimum proc done + int max_num_jobs_remaining = 0; + int num_jobs_remaining; + + // Mark the completion to avoid check in the loop + tile_completion_status[*cur_tile_id] = 1; + // Check for the status of all the tiles + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + if (tile_completion_status[tile_col] == 0) { + num_jobs_remaining = + vp9_get_job_queue_status(multi_thread_ctxt, tile_col); + // Mark the completion to avoid checks during future switches across tiles + if (num_jobs_remaining == 0) tile_completion_status[tile_col] = 1; + if (num_jobs_remaining > max_num_jobs_remaining) { + max_num_jobs_remaining = num_jobs_remaining; + tile_id = tile_col; + } + } + } + + if (-1 == tile_id) { + return 1; + } else { + // Update the cur ID to the next tile ID that will be processed, + // which will be the least processed tile + *cur_tile_id = tile_id; + return 0; + } +} diff --git a/libs/libvpx/vp9/encoder/vp9_multi_thread.h b/libs/libvpx/vp9/encoder/vp9_multi_thread.h new file mode 100644 index 0000000000..bfc0c0ae4f --- /dev/null +++ b/libs/libvpx/vp9/encoder/vp9_multi_thread.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_ENCODER_VP9_MULTI_THREAD_H +#define VP9_ENCODER_VP9_MULTI_THREAD_H + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_job_queue.h" + +void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt, + int tile_id); + +void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type); + +int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt, + int cur_tile_id); + +void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt, + int tile_cols, int num_workers); + +void vp9_multi_thread_tile_init(VP9_COMP *cpi); + +void vp9_row_mt_mem_alloc(VP9_COMP *cpi); + +void vp9_row_mt_mem_dealloc(VP9_COMP *cpi); + +int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt, + int *tile_completion_status, int *cur_tile_id, + int tile_cols); + +#endif // VP9_ENCODER_VP9_MULTI_THREAD_H diff --git a/libs/libvpx/vp9/encoder/vp9_noise_estimate.c b/libs/libvpx/vp9/encoder/vp9_noise_estimate.c index 0e5d8ade4a..276a0c7852 100644 --- a/libs/libvpx/vp9/encoder/vp9_noise_estimate.c +++ b/libs/libvpx/vp9/encoder/vp9_noise_estimate.c @@ -21,6 +21,15 @@ #include "vp9/encoder/vp9_noise_estimate.h" #include "vp9/encoder/vp9_encoder.h" +#if CONFIG_VP9_TEMPORAL_DENOISING +// For SVC: only do noise estimation on top spatial layer. +static INLINE int noise_est_svc(const struct VP9_COMP *const cpi) { + return (!cpi->use_svc || + (cpi->use_svc && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)); +} +#endif + void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { ne->enabled = 0; ne->level = kLowLow; @@ -32,15 +41,22 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { if (width * height >= 1920 * 1080) { ne->thresh = 200; } else if (width * height >= 1280 * 720) { - ne->thresh = 130; + ne->thresh = 140; + } else if (width * height >= 640 * 360) { + ne->thresh = 115; } - ne->num_frames_estimate = 20; + ne->num_frames_estimate = 15; } static int enable_noise_estimation(VP9_COMP *const cpi) { +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) return 0; +#endif // Enable noise estimation if denoising is on. #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) return 1; + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && + cpi->common.width >= 320 && cpi->common.height >= 180) + return 1; #endif // Only allow noise estimate under certain encoding mode. // Enabled for 1 pass CBR, speed >=5, and if resolution is same as original. @@ -49,8 +65,8 @@ static int enable_noise_estimation(VP9_COMP *const cpi) { if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 && cpi->resize_state == ORIG && cpi->resize_pending == 0 && !cpi->use_svc && - cpi->oxcf.content != VP9E_CONTENT_SCREEN && cpi->common.width >= 640 && - cpi->common.height >= 480) + cpi->oxcf.content != VP9E_CONTENT_SCREEN && + cpi->common.width * cpi->common.height >= 640 * 360) return 1; else return 0; @@ -92,6 +108,7 @@ NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) { void vp9_update_noise_estimate(VP9_COMP *const cpi) { const VP9_COMMON *const cm = &cpi->common; NOISE_ESTIMATE *const ne = &cpi->noise_estimate; + const int low_res = (cm->width <= 352 && cm->height <= 288); // Estimate of noise level every frame_period frames. 
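The hunk below keeps the eight-frame refresh period but, for spatial SVC, counts superframes rather than individual frames so that all spatial layers of a superframe make the same decision. A rough condensation of that gating (hypothetical helper name, not the encoder's code):

    /* Returns nonzero when the noise estimate should be refreshed. */
    static int sketch_refresh_noise_estimate(int enabled, int num_spatial_layers,
                                             int current_video_frame,
                                             int current_superframe,
                                             int frame_period) {
      const int frame_counter = (num_spatial_layers > 1) ? current_superframe
                                                         : current_video_frame;
      return enabled && (frame_counter % frame_period == 0);
    }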
int frame_period = 8; int thresh_consec_zeromv = 6; @@ -99,17 +116,31 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { unsigned int thresh_sum_spatial = (200 * 200) << 8; unsigned int thresh_spatial_var = (32 * 32) << 8; int min_blocks_estimate = cm->mi_rows * cm->mi_cols >> 7; + int frame_counter = cm->current_video_frame; // Estimate is between current source and last source. YV12_BUFFER_CONFIG *last_source = cpi->Last_Source; #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) last_source = &cpi->denoiser.last_source; + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) { + last_source = &cpi->denoiser.last_source; + // Tune these thresholds for different resolutions when denoising is + // enabled. + if (cm->width > 640 && cm->width < 1920) { + thresh_consec_zeromv = 4; + thresh_sum_diff = 200; + thresh_sum_spatial = (120 * 120) << 8; + thresh_spatial_var = (48 * 48) << 8; + } + } #endif ne->enabled = enable_noise_estimation(cpi); - if (!ne->enabled || cm->current_video_frame % frame_period != 0 || - last_source == NULL || ne->last_w != cm->width || - ne->last_h != cm->height) { + if (cpi->svc.number_spatial_layers > 1) + frame_counter = cpi->svc.current_superframe; + if (!ne->enabled || frame_counter % frame_period != 0 || + last_source == NULL || + (cpi->svc.number_spatial_layers == 1 && + (ne->last_w != cm->width || ne->last_h != cm->height))) { #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) copy_frame(&cpi->denoiser.last_source, cpi->Source); #endif if (last_source != NULL) { @@ -117,12 +148,18 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { ne->last_h = cm->height; } return; - } else if (cpi->rc.avg_frame_low_motion < 50) { + } else if (cm->current_video_frame > 60 && + cpi->rc.avg_frame_low_motion < (low_res ? 70 : 50)) { // Force noise estimation to 0 and denoiser off if content has high motion. ne->level = kLowLow; + ne->count = 0; + ne->num_frames_estimate = 10; #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && + cpi->svc.current_superframe > 1) { vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); + copy_frame(&cpi->denoiser.last_source, cpi->Source); + } #endif return; } else { @@ -162,43 +199,42 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { int bl_index1 = bl_index + 1; int bl_index2 = bl_index + cm->mi_cols; int bl_index3 = bl_index2 + 1; - // Only consider blocks that are likely steady background. i.e, have - // been encoded as zero/low motion x (= thresh_consec_zeromv) frames - // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all - // 4 sub-blocks for 16x16 block. Also, avoid skin blocks. int consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], VPXMIN(cpi->consec_zero_mv[bl_index1], VPXMIN(cpi->consec_zero_mv[bl_index2], cpi->consec_zero_mv[bl_index3]))); - int is_skin = 0; - if (cpi->use_skin_detection) { - is_skin = - vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, - src_uvstride, bsize, consec_zeromv, 0); - } - if (frame_low_motion && - cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv && - cpi->consec_zero_mv[bl_index1] > thresh_consec_zeromv && - cpi->consec_zero_mv[bl_index2] > thresh_consec_zeromv && - cpi->consec_zero_mv[bl_index3] > thresh_consec_zeromv && - !is_skin) { - // Compute variance. 
- unsigned int sse; - unsigned int variance = cpi->fn_ptr[bsize].vf( - src_y, src_ystride, last_src_y, last_src_ystride, &sse); - // Only consider this block as valid for noise measurement if the - // average term (sse - variance = N * avg^{2}, N = 16X16) of the - // temporal residual is small (avoid effects from lighting change). - if ((sse - variance) < thresh_sum_diff) { - unsigned int sse2; - const unsigned int spatial_variance = cpi->fn_ptr[bsize].vf( - src_y, src_ystride, const_source, 0, &sse2); - // Avoid blocks with high brightness and high spatial variance. - if ((sse2 - spatial_variance) < thresh_sum_spatial && - spatial_variance < thresh_spatial_var) { - avg_est += variance / ((spatial_variance >> 9) + 1); - num_samples++; + // Only consider blocks that are likely steady background. i.e, have + // been encoded as zero/low motion x (= thresh_consec_zeromv) frames + // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all + // 4 sub-blocks for 16x16 block. Also, avoid skin blocks. + if (frame_low_motion && consec_zeromv > thresh_consec_zeromv) { + int is_skin = 0; + if (cpi->use_skin_detection) { + is_skin = + vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, + src_uvstride, bsize, consec_zeromv, 0); + } + if (!is_skin) { + unsigned int sse; + // Compute variance. + unsigned int variance = cpi->fn_ptr[bsize].vf( + src_y, src_ystride, last_src_y, last_src_ystride, &sse); + // Only consider this block as valid for noise measurement if the + // average term (sse - variance = N * avg^{2}, N = 16X16) of the + // temporal residual is small (avoid effects from lighting + // change). + if ((sse - variance) < thresh_sum_diff) { + unsigned int sse2; + const unsigned int spatial_variance = cpi->fn_ptr[bsize].vf( + src_y, src_ystride, const_source, 0, &sse2); + // Avoid blocks with high brightness and high spatial variance. + if ((sse2 - spatial_variance) < thresh_sum_spatial && + spatial_variance < thresh_spatial_var) { + avg_est += low_res ? 
variance >> 4 + : variance / ((spatial_variance >> 9) + 1); + num_samples++; + } } } } @@ -230,14 +266,14 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { ne->count = 0; ne->level = vp9_noise_estimate_extract_level(ne); #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); #endif } } } #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) copy_frame(&cpi->denoiser.last_source, cpi->Source); #endif } diff --git a/libs/libvpx/vp9/encoder/vp9_picklpf.c b/libs/libvpx/vp9/encoder/vp9_picklpf.c index 1583cc8abc..1c2c55b9e4 100644 --- a/libs/libvpx/vp9/encoder/vp9_picklpf.c +++ b/libs/libvpx/vp9/encoder/vp9_picklpf.c @@ -181,6 +181,11 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, #else int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); #endif // CONFIG_VP9_HIGHBITDEPTH + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && + cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && + cpi->oxcf.content != VP9E_CONTENT_SCREEN && cm->frame_type != KEY_FRAME) + filt_guess = 5 * filt_guess >> 3; + if (cm->frame_type == KEY_FRAME) filt_guess -= 4; lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level); } else { diff --git a/libs/libvpx/vp9/encoder/vp9_pickmode.c b/libs/libvpx/vp9/encoder/vp9_pickmode.c index 76d2611d89..f2f323a282 100644 --- a/libs/libvpx/vp9/encoder/vp9_pickmode.c +++ b/libs/libvpx/vp9/encoder/vp9_pickmode.c @@ -158,6 +158,7 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const MvLimits tmp_mv_limits = x->mv_limits; int rv = 0; int cost_list[5]; + int search_subpel = 1; const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); if (scaled_ref_frame) { @@ -170,6 +171,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } vp9_set_mv_search_range(&x->mv_limits, &ref_mv); + // Limit motion vector for large lightning change. + if (cpi->oxcf.speed > 5 && x->lowvar_highsumdiff) { + x->mv_limits.col_min = VPXMAX(x->mv_limits.col_min, -10); + x->mv_limits.row_min = VPXMAX(x->mv_limits.row_min, -10); + x->mv_limits.col_max = VPXMIN(x->mv_limits.col_max, 10); + x->mv_limits.row_max = VPXMIN(x->mv_limits.row_max, 10); + } + assert(x->mv_best_ref_index[ref] <= 2); if (x->mv_best_ref_index[ref] < 2) mvp_full = x->mbmi_ext->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv; @@ -184,9 +193,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, else center_mv = tmp_mv->as_mv; - vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, - cond_cost_list(cpi, cost_list), &center_mv, - &tmp_mv->as_mv, INT_MAX, 0); + if (x->sb_use_mv_part) { + tmp_mv->as_mv.row = x->sb_mvrow_part >> 3; + tmp_mv->as_mv.col = x->sb_mvcol_part >> 3; + } else { + vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb, + cond_cost_list(cpi, cost_list), &center_mv, &tmp_mv->as_mv, INT_MAX, 0); + } x->mv_limits = tmp_mv_limits; @@ -202,10 +216,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, rv = !(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) > best_rd_sofar); - if (rv) { - const int subpel_force_stop = use_base_mv && cpi->sf.base_mv_aggressive - ? 2 - : cpi->sf.mv.subpel_force_stop; + // For SVC on non-reference frame, avoid subpel for (0, 0) motion.
+ if (cpi->use_svc && cpi->svc.non_reference_frame) { + if (mvp_full.row == 0 && mvp_full.col == 0) search_subpel = 0; + } + + if (rv && search_subpel) { + int subpel_force_stop = cpi->sf.mv.subpel_force_stop; + if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = 2; cpi->find_fractional_mv_step( x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop, @@ -296,11 +314,24 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size, } } +// Adjust the ac_thr according to speed, width, height and normalized sum +static int ac_thr_factor(const int speed, const int width, const int height, + const int norm_sum) { + if (speed >= 8 && norm_sum < 5) { + if (width <= 640 && height <= 480) + return 4; + else + return 2; + } + return 1; +} + static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, unsigned int *var_y, unsigned int *sse_y, - int mi_row, int mi_col, int *early_term) { + int mi_row, int mi_col, int *early_term, + int *flag_preduv_computed) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. @@ -312,7 +343,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, const uint32_t dc_quant = pd->dequant[0]; const uint32_t ac_quant = pd->dequant[1]; const int64_t dc_thr = dc_quant * dc_quant >> 6; - const int64_t ac_thr = ac_quant * ac_quant >> 6; + int64_t ac_thr = ac_quant * ac_quant >> 6; unsigned int var; int sum; int skip_dc = 0; @@ -341,6 +372,20 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, *var_y = var; *sse_y = sse; +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->oxcf.speed > 5) + ac_thr = vp9_scale_acskip_thresh(ac_thr, cpi->denoiser.denoising_level, + (abs(sum) >> (bw + bh)), + cpi->svc.temporal_layer_id); + else + ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width, + cpi->common.height, abs(sum) >> (bw + bh)); +#else + ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width, + cpi->common.height, abs(sum) >> (bw + bh)); +#endif + if (cpi->common.tx_mode == TX_MODE_SELECT) { if (sse > (var << 2)) tx_size = VPXMIN(max_txsize_lookup[bsize], @@ -428,28 +473,33 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, // Transform skipping test in UV planes. 
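The loop that follows decides, per chroma plane, whether the transform can be skipped: the AC energy (variance) must fall below an AC threshold derived from the dequant step, and the DC energy (sse minus variance) must fall below a DC threshold. A compact restatement of the two conditions (illustrative helper, not the encoder's code):

    /* Chroma plane is transform-skippable when both AC and DC energy are small. */
    static int sketch_uv_plane_skippable(unsigned int var, unsigned int sse,
                                         unsigned int uv_ac_thr,
                                         unsigned int uv_dc_thr) {
      const int ac_small = (var < uv_ac_thr) || (var == 0);
      const int dc_small = ((sse - var) < uv_dc_thr) || (sse == var);
      return ac_small && dc_small;
    }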
for (i = 1; i <= 2; i++) { - struct macroblock_plane *const p = &x->plane[i]; - struct macroblockd_plane *const pd = &xd->plane[i]; - const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd); - const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size]; - const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd); - const int uv_bw = b_width_log2_lookup[uv_bsize]; - const int uv_bh = b_height_log2_lookup[uv_bsize]; - const int sf = (uv_bw - b_width_log2_lookup[unit_size]) + - (uv_bh - b_height_log2_lookup[unit_size]); - const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf); - const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf); - int j = i - 1; + if (cpi->oxcf.speed < 8 || x->color_sensitivity[i - 1]) { + struct macroblock_plane *const p = &x->plane[i]; + struct macroblockd_plane *const pd = &xd->plane[i]; + const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd); + const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size]; + const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd); + const int uv_bw = b_width_log2_lookup[uv_bsize]; + const int uv_bh = b_height_log2_lookup[uv_bsize]; + const int sf = (uv_bw - b_width_log2_lookup[unit_size]) + + (uv_bh - b_height_log2_lookup[unit_size]); + const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf); + const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf); + int j = i - 1; - vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); - var_uv[j] = cpi->fn_ptr[uv_bsize].vf( - p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]); + vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); + flag_preduv_computed[i - 1] = 1; + var_uv[j] = cpi->fn_ptr[uv_bsize].vf( + p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]); - if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && - (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) - skip_uv[j] = 1; - else - break; + if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && + (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) + skip_uv[j] = 1; + else + break; + } else { + skip_uv[i - 1] = 1; + } } // If the transform in YUV planes are skippable, the mode search checks @@ -457,7 +507,6 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, if (skip_uv[0] & skip_uv[1]) { *early_term = 1; } - return; } @@ -590,24 +639,9 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, *out_dist_sum += dist << 4; } -#if CONFIG_VP9_HIGHBITDEPTH static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, int *skippable, int64_t *sse, BLOCK_SIZE bsize, - TX_SIZE tx_size) { - MACROBLOCKD *xd = &x->e_mbd; - unsigned int var_y, sse_y; - - (void)tx_size; - model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, &var_y, - &sse_y); - *sse = INT_MAX; - *skippable = 0; - return; -} -#else -static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, - int *skippable, int64_t *sse, BLOCK_SIZE bsize, - TX_SIZE tx_size) { + TX_SIZE tx_size, int rd_computed) { MACROBLOCKD *xd = &x->e_mbd; const struct macroblockd_plane *pd = &xd->plane[0]; struct macroblock_plane *const p = &x->plane[0]; @@ -624,6 +658,39 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, const int bw = 4 * num_4x4_w; const int bh = 4 * num_4x4_h; +#if CONFIG_VP9_HIGHBITDEPTH + // TODO(jingning): Implement the high bit-depth Hadamard transforms and + // remove this check condition. 
+ // TODO(marpan): Use this path (model_rd) for 8bit under certain conditions + // for now, as the vp9_quantize_fp below for highbitdepth build is slow. + if (xd->bd != 8 || + (cpi->oxcf.speed > 5 && cpi->common.frame_type != KEY_FRAME && + bsize < BLOCK_32X32)) { + unsigned int var_y, sse_y; + (void)tx_size; + if (!rd_computed) + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, + &var_y, &sse_y); + *sse = INT_MAX; + *skippable = 0; + return; + } +#endif + + if (cpi->sf.use_simple_block_yrd && cpi->common.frame_type != KEY_FRAME && + (bsize < BLOCK_32X32 || + (cpi->use_svc && + (bsize < BLOCK_32X32 || cpi->svc.temporal_layer_id > 0)))) { + unsigned int var_y, sse_y; + (void)tx_size; + if (!rd_computed) + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, + &var_y, &sse_y); + *sse = INT_MAX; + *skippable = 0; + return; + } + (void)cpi; // The max tx_size passed in is TX_16X16. @@ -648,24 +715,21 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, switch (tx_size) { case TX_16X16: - vpx_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff); - vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, + vpx_hadamard_16x16(src_diff, diff_stride, coeff); + vp9_quantize_fp(coeff, 256, x->skip_block, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_8X8: - vpx_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff); - vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, + vpx_hadamard_8x8(src_diff, diff_stride, coeff); + vp9_quantize_fp(coeff, 64, x->skip_block, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; default: assert(0); break; @@ -699,7 +763,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, if (*eob == 1) this_rdc->rate += (int)abs(qcoeff[0]); else if (*eob > 1) - this_rdc->rate += vpx_satd((const int16_t *)qcoeff, step << 4); + this_rdc->rate += vpx_satd(qcoeff, step << 4); this_rdc->dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> 2; } @@ -711,7 +775,6 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, this_rdc->rate <<= (2 + VP9_PROB_COST_SHIFT); this_rdc->rate += (eob_cost << VP9_PROB_COST_SHIFT); } -#endif static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE plane_bsize, MACROBLOCK *x, MACROBLOCKD *xd, @@ -799,13 +862,11 @@ static void free_pred_buffer(PRED_BUFFER *p) { if (p != NULL) p->in_use = 0; } -static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - int mi_row, int mi_col, - MV_REFERENCE_FRAME ref_frame, - PREDICTION_MODE this_mode, unsigned int var_y, - unsigned int sse_y, - struct buf_2d yv12_mb[][MAX_MB_PLANE], - int *rate, int64_t *dist) { +static void encode_breakout_test( + VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, + MV_REFERENCE_FRAME ref_frame, PREDICTION_MODE this_mode, 
unsigned int var_y, + unsigned int sse_y, struct buf_2d yv12_mb[][MAX_MB_PLANE], int *rate, + int64_t *dist, int *flag_preduv_computed) { MACROBLOCKD *xd = &x->e_mbd; MODE_INFO *const mi = xd->mi[0]; const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); @@ -815,6 +876,7 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // Skipping threshold for dc. unsigned int thresh_dc; int motion_low = 1; + if (cpi->use_svc && ref_frame == GOLDEN_FRAME) return; if (mi->mv[0].as_mv.row > 64 || mi->mv[0].as_mv.row < -64 || mi->mv[0].as_mv.col > 64 || mi->mv[0].as_mv.col < -64) motion_low = 0; @@ -865,9 +927,7 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, thresh_dc_uv = 0; } - // Skip UV prediction unless breakout is zero (lossless) to save - // computation with low impact on the result - if (x->encode_breakout == 0) { + if (!flag_preduv_computed[0] || !flag_preduv_computed[1]) { xd->plane[1].pre[0] = yv12_mb[ref_frame][1]; xd->plane[2].pre[0] = yv12_mb[ref_frame][2]; vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize); @@ -944,7 +1004,7 @@ static void estimate_block_intra(int plane, int block, int row, int col, int64_t this_sse = INT64_MAX; // TODO(jingning): This needs further refactoring. block_yrd(cpi, x, &this_rdc, &args->skippable, &this_sse, bsize_tx, - VPXMIN(tx_size, TX_16X16)); + VPXMIN(tx_size, TX_16X16), 0); } else { unsigned int var = 0; unsigned int sse = 0; @@ -958,10 +1018,11 @@ static void estimate_block_intra(int plane, int block, int row, int col, args->rdc->dist += this_rdc.dist; } -static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = { +static const THR_MODES mode_idx[MAX_REF_FRAMES][4] = { { THR_DC, THR_V_PRED, THR_H_PRED, THR_TM }, { THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV }, { THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG }, + { THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA }, }; static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED, @@ -981,6 +1042,32 @@ static int mode_offset(const PREDICTION_MODE mode) { } } +static INLINE int rd_less_than_thresh_row_mt(int64_t best_rd, int thresh, + const int *const thresh_fact) { + int is_rd_less_than_thresh; + is_rd_less_than_thresh = + best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX; + return is_rd_less_than_thresh; +} + +static INLINE void update_thresh_freq_fact_row_mt( + VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance, + int thresh_freq_fact_idx, MV_REFERENCE_FRAME ref_frame, + THR_MODES best_mode_idx, PREDICTION_MODE mode) { + THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)]; + int freq_fact_idx = thresh_freq_fact_idx + thr_mode_idx; + int *freq_fact = &tile_data->row_base_thresh_freq_fact[freq_fact_idx]; + if (thr_mode_idx == best_mode_idx) + *freq_fact -= (*freq_fact >> 4); + else if (cpi->sf.limit_newmv_early_exit && mode == NEWMV && + ref_frame == LAST_FRAME && source_variance < 5) { + *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC, 32); + } else { + *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC, + cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT); + } +} + static INLINE void update_thresh_freq_fact( VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance, BLOCK_SIZE bsize, MV_REFERENCE_FRAME ref_frame, THR_MODES best_mode_idx, @@ -1080,40 +1167,31 @@ typedef struct { PREDICTION_MODE pred_mode; } REF_MODE; -#define RT_INTER_MODES 8 +#define RT_INTER_MODES 12 static const REF_MODE ref_mode_set[RT_INTER_MODES] = { { LAST_FRAME, ZEROMV }, { LAST_FRAME, 
NEARESTMV }, { GOLDEN_FRAME, ZEROMV }, { LAST_FRAME, NEARMV }, { LAST_FRAME, NEWMV }, { GOLDEN_FRAME, NEARESTMV }, - { GOLDEN_FRAME, NEARMV }, { GOLDEN_FRAME, NEWMV } + { GOLDEN_FRAME, NEARMV }, { GOLDEN_FRAME, NEWMV }, + { ALTREF_FRAME, ZEROMV }, { ALTREF_FRAME, NEARESTMV }, + { ALTREF_FRAME, NEARMV }, { ALTREF_FRAME, NEWMV } }; -static const REF_MODE ref_mode_set_svc[RT_INTER_MODES] = { - { LAST_FRAME, ZEROMV }, { GOLDEN_FRAME, ZEROMV }, - { LAST_FRAME, NEARESTMV }, { LAST_FRAME, NEARMV }, + +#define RT_INTER_MODES_SVC 8 +static const REF_MODE ref_mode_set_svc[RT_INTER_MODES_SVC] = { + { LAST_FRAME, ZEROMV }, { LAST_FRAME, NEARESTMV }, + { LAST_FRAME, NEARMV }, { GOLDEN_FRAME, ZEROMV }, { GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV }, { LAST_FRAME, NEWMV }, { GOLDEN_FRAME, NEWMV } }; -static int set_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize) { - const VP9_COMMON *const cm = &cpi->common; - // Reduce the intra cost penalty for small blocks (<=16x16). - int reduction_fac = - (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0; - if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh) - // Don't reduce intra cost penalty if estimated noise level is high. - reduction_fac = 0; - return vp9_get_intra_cost_penalty(cm->base_qindex, cm->y_dc_delta_q, - cm->bit_depth) >> - reduction_fac; -} - static INLINE void find_predictors( VP9_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int const_motion[MAX_REF_FRAMES], int *ref_frame_skip_mask, const int flag_list[4], TileDataEnc *tile_data, int mi_row, int mi_col, struct buf_2d yv12_mb[4][MAX_MB_PLANE], BLOCK_SIZE bsize, - int force_skip_low_temp_var) { + int force_skip_low_temp_var, int comp_pred_allowed) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); @@ -1127,7 +1205,7 @@ static INLINE void find_predictors( int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); - if (cm->use_prev_frame_mvs) { + if (cm->use_prev_frame_mvs || comp_pred_allowed) { vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col, x->mbmi_ext->mode_context); } else { @@ -1153,7 +1231,8 @@ static INLINE void find_predictors( static void vp9_NEWMV_diff_bias(const NOISE_ESTIMATE *ne, MACROBLOCKD *xd, PREDICTION_MODE this_mode, RD_COST *this_rdc, BLOCK_SIZE bsize, int mv_row, int mv_col, - int is_last_frame) { + int is_last_frame, int lowvar_highsumdiff, + int is_skin) { // Bias against MVs associated with NEWMV mode that are very different from // top/left neighbors. if (this_mode == NEWMV) { @@ -1200,9 +1279,12 @@ static void vp9_NEWMV_diff_bias(const NOISE_ESTIMATE *ne, MACROBLOCKD *xd, // If noise estimation is enabled, and estimated level is above threshold, // add a bias to LAST reference with small motion, for large blocks. 
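Both branches that follow apply the same bias: rdcost is rescaled by 7/8, roughly a 12.5% discount, which is enough to tip close decisions toward the small-motion LAST candidate without overriding a clear winner. A quick numeric illustration (hypothetical helper, not the encoder's code):

    #include <stdint.h>

    /* Same arithmetic as the bias below: scale an RD cost by 7/8. */
    static int64_t sketch_bias_rdcost(int64_t rdcost) {
      return 7 * (rdcost >> 3); /* e.g. 1000 -> 875 */
    }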
if (ne->enabled && ne->level >= kMedium && bsize >= BLOCK_32X32 && - is_last_frame && mv_row < 8 && mv_row > -8 && mv_col < 8 && mv_col > -8) { - this_rdc->rdcost = 7 * this_rdc->rdcost >> 3; - } + is_last_frame && mv_row < 8 && mv_row > -8 && mv_col < 8 && mv_col > -8) + this_rdc->rdcost = 7 * (this_rdc->rdcost >> 3); + else if (lowvar_highsumdiff && !is_skin && bsize >= BLOCK_16X16 && + is_last_frame && mv_row < 16 && mv_row > -16 && mv_col < 16 && + mv_col > -16) + this_rdc->rdcost = 7 * (this_rdc->rdcost >> 3); } #if CONFIG_VP9_TEMPORAL_DENOISING @@ -1237,6 +1319,7 @@ static void recheck_zeromv_after_denoising( ctx_den->zero_last_cost_orig < (best_rdc->rdcost << 3) && ((ctx_den->best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) || (ctx_den->best_ref_frame == GOLDEN_FRAME && + cpi->svc.number_spatial_layers == 1 && decision == FILTER_ZEROMV_BLOCK))) { // Check if we should pick ZEROMV on denoised signal. int rate = 0; @@ -1257,16 +1340,17 @@ static void recheck_zeromv_after_denoising( [INTER_OFFSET(ZEROMV)]; this_rdc.dist = dist; this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, rate, dist); - // Switch to ZEROMV if the rdcost for ZEROMV on denoised source - // is lower than best_ref mode (on original source). + // Don't switch to ZEROMV if the rdcost for ZEROMV on denoised source + // is higher than best_ref mode (on original source). if (this_rdc.rdcost > best_rdc->rdcost) { this_rdc = *best_rdc; mi->mode = ctx_den->best_mode; mi->ref_frame[0] = ctx_den->best_ref_frame; mi->interp_filter = ctx_den->best_pred_filter; - if (ctx_den->best_ref_frame == INTRA_FRAME) + if (ctx_den->best_ref_frame == INTRA_FRAME) { mi->mv[0].as_int = INVALID_MV; - else if (ctx_den->best_ref_frame == GOLDEN_FRAME) { + mi->interp_filter = SWITCHABLE_FILTERS; + } else if (ctx_den->best_ref_frame == GOLDEN_FRAME) { mi->mv[0].as_int = ctx_den->frame_mv[ctx_den->best_mode][ctx_den->best_ref_frame] .as_int; @@ -1343,10 +1427,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, struct macroblockd_plane *const pd = &xd->plane[0]; PREDICTION_MODE best_mode = ZEROMV; MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME; - MV_REFERENCE_FRAME usable_ref_frame; + MV_REFERENCE_FRAME usable_ref_frame, second_ref_frame; TX_SIZE best_tx_size = TX_SIZES; INTERP_FILTER best_pred_filter = EIGHTTAP; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; + uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; @@ -1355,11 +1440,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // var_y and sse_y are saved to be used in skipping checking unsigned int var_y = UINT_MAX; unsigned int sse_y = UINT_MAX; - const int intra_cost_penalty = set_intra_cost_penalty(cpi, bsize); + const int intra_cost_penalty = + vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv, intra_cost_penalty, 0); const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize]; - const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; + const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + int thresh_freq_fact_idx = (sb_row * BLOCK_SIZES + bsize) * MAX_MODES; + const int *const rd_thresh_freq_fact = + (cpi->sf.adaptive_rd_thresh_row_mt) + ? 
&(tile_data->row_base_thresh_freq_fact[thresh_freq_fact_idx]) + : tile_data->thresh_freq_fact[bsize]; + INTERP_FILTER filter_ref; const int bsl = mi_width_log2_lookup[bsize]; const int pred_filter_search = @@ -1393,13 +1485,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int perform_intra_pred = 1; int use_golden_nonzeromv = 1; int force_skip_low_temp_var = 0; + int skip_ref_find_pred[4] = { 0 }; + unsigned int sse_zeromv_normalized = UINT_MAX; + unsigned int best_sse_sofar = UINT_MAX; + unsigned int thresh_svc_skip_golden = 500; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_PICKMODE_CTX_DEN ctx_den; int64_t zero_last_cost_orig = INT64_MAX; + int denoise_svc_pickmode = 1; #endif + INTERP_FILTER filter_gf_svc = EIGHTTAP; + MV_REFERENCE_FRAME best_second_ref_frame = NONE; + int comp_modes = 0; + int num_inter_modes = (cpi->use_svc) ? RT_INTER_MODES_SVC : RT_INTER_MODES; + int flag_svc_subpel = 0; + int svc_mv_col = 0; + int svc_mv_row = 0; init_ref_frame_cost(cm, xd, ref_frame_cost); + memset(&mode_checked[0][0], 0, MB_MODE_COUNT * MAX_REF_FRAMES); + if (reuse_inter_pred) { int i; for (i = 0; i < 3; i++) { @@ -1455,18 +1561,43 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && - cpi->denoiser.denoising_level > kDenLowLow) { - vp9_denoiser_reset_frame_stats(ctx); + if (cpi->oxcf.noise_sensitivity > 0) { + if (cpi->use_svc) { + int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, + cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + denoise_svc_pickmode = denoise_svc(cpi) && !lc->is_key_frame; + } + if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode) + vp9_denoiser_reset_frame_stats(ctx); } #endif - if (cpi->rc.frames_since_golden == 0 && !cpi->use_svc) { + if (cpi->rc.frames_since_golden == 0 && !cpi->use_svc && + !cpi->rc.alt_ref_gf_group && !cpi->rc.last_frame_is_src_altref) { usable_ref_frame = LAST_FRAME; } else { usable_ref_frame = GOLDEN_FRAME; } + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) { + if (cpi->rc.alt_ref_gf_group || cpi->rc.is_src_frame_alt_ref) + usable_ref_frame = ALTREF_FRAME; + + if (cpi->rc.is_src_frame_alt_ref) { + skip_ref_find_pred[LAST_FRAME] = 1; + skip_ref_find_pred[GOLDEN_FRAME] = 1; + } + if (!cm->show_frame) { + if (cpi->rc.frames_since_key == 1) { + usable_ref_frame = LAST_FRAME; + skip_ref_find_pred[GOLDEN_FRAME] = 1; + skip_ref_find_pred[ALTREF_FRAME] = 1; + } + } + } + // For svc mode, on spatial_layer_id > 0: if the reference has different scale // constrain the inter mode to only test zero motion. if (cpi->use_svc && svc->force_zero_mode_spatial_ref && @@ -1484,19 +1615,54 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cpi->sf.short_circuit_low_temp_var) { force_skip_low_temp_var = get_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize); + // If force_skip_low_temp_var is set, and for short circuit mode = 1 and 3, + // skip golden reference. 
+ if ((cpi->sf.short_circuit_low_temp_var == 1 || + cpi->sf.short_circuit_low_temp_var == 3) && + force_skip_low_temp_var) { + usable_ref_frame = LAST_FRAME; + } } if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var)) use_golden_nonzeromv = 0; + if (cpi->oxcf.speed >= 8 && !cpi->use_svc && + ((cpi->rc.frames_since_golden + 1) < x->last_sb_high_content || + x->last_sb_high_content > 40 || cpi->rc.frames_since_golden > 120)) + usable_ref_frame = LAST_FRAME; + + // Compound prediction modes: (0,0) on LAST/GOLDEN and ARF. + if (cm->reference_mode == REFERENCE_MODE_SELECT && + cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME) + comp_modes = 2; + for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { - find_predictors(cpi, x, ref_frame, frame_mv, const_motion, - &ref_frame_skip_mask, flag_list, tile_data, mi_row, mi_col, - yv12_mb, bsize, force_skip_low_temp_var); + if (!skip_ref_find_pred[ref_frame]) { + find_predictors(cpi, x, ref_frame, frame_mv, const_motion, + &ref_frame_skip_mask, flag_list, tile_data, mi_row, + mi_col, yv12_mb, bsize, force_skip_low_temp_var, + comp_modes > 0); + } } - for (idx = 0; idx < RT_INTER_MODES; ++idx) { + if (cpi->use_svc || cpi->oxcf.speed <= 7 || bsize < BLOCK_32X32) + x->sb_use_mv_part = 0; + + // Set the flag_svc_subpel to 1 for SVC if the lower spatial layer used + // an averaging filter for downsampling (phase = 8). If so, we will test + // a nonzero motion mode on the spatial (goldeen) reference. + // The nonzero motion is half pixel shifted to left and top (-4, -4). + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && + svc_force_zero_mode[GOLDEN_FRAME - 1] && + cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id - 1] == 8) { + svc_mv_col = -4; + svc_mv_row = -4; + flag_svc_subpel = 1; + } + + for (idx = 0; idx < num_inter_modes + comp_modes; ++idx) { int rate_mv = 0; int mode_rd_thresh; int mode_index; @@ -1504,9 +1670,63 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int64_t this_sse; int is_skippable; int this_early_term = 0; - PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode; + int rd_computed = 0; + int flag_preduv_computed[2] = { 0 }; + int inter_mv_mode = 0; + int skip_this_mv = 0; + int comp_pred = 0; + int force_gf_mv = 0; + PREDICTION_MODE this_mode; + second_ref_frame = NONE; - if (cpi->use_svc) this_mode = ref_mode_set_svc[idx].pred_mode; + if (idx < num_inter_modes) { + this_mode = ref_mode_set[idx].pred_mode; + ref_frame = ref_mode_set[idx].ref_frame; + + if (cpi->use_svc) { + this_mode = ref_mode_set_svc[idx].pred_mode; + ref_frame = ref_mode_set_svc[idx].ref_frame; + } + } else { + // Add (0,0) compound modes. + this_mode = ZEROMV; + ref_frame = LAST_FRAME; + if (idx == num_inter_modes + comp_modes - 1) ref_frame = GOLDEN_FRAME; + second_ref_frame = ALTREF_FRAME; + comp_pred = 1; + } + + if (ref_frame > usable_ref_frame) continue; + if (skip_ref_find_pred[ref_frame]) continue; + + if (flag_svc_subpel && ref_frame == GOLDEN_FRAME) { + force_gf_mv = 1; + // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row), + // otherwise set NEWMV to (svc_mv_col, svc_mv_row). 
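Background for the force_gf_mv handling that follows: when the lower spatial layer was downsampled with the averaging filter (phase == 8), the upscaled golden (spatial) reference is effectively offset by half a pixel, so the only non-zero motion worth testing against it is (-4, -4) in 1/8-pel units, half a pixel up and to the left. A condensed sketch of that setup, mirroring the assignments made earlier in this hunk (illustrative helper):

    /* Pick the single golden-frame MV to test for an SVC enhancement layer. */
    static void sketch_svc_golden_mv(int downsample_filter_phase,
                                     int *svc_mv_row, int *svc_mv_col,
                                     int *flag_svc_subpel) {
      if (downsample_filter_phase == 8) { /* averaging downsample filter */
        *svc_mv_row = -4; /* half pel up, in 1/8-pel units */
        *svc_mv_col = -4; /* half pel left, in 1/8-pel units */
        *flag_svc_subpel = 1;
      }
    }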
+ if (this_mode == NEWMV) { + frame_mv[this_mode][ref_frame].as_mv.col = svc_mv_col; + frame_mv[this_mode][ref_frame].as_mv.row = svc_mv_row; + } else if (frame_mv[this_mode][ref_frame].as_mv.col != svc_mv_col || + frame_mv[this_mode][ref_frame].as_mv.row != svc_mv_row) { + continue; + } + } + + if (comp_pred) { + const struct segmentation *const seg = &cm->seg; + if (!cpi->allow_comp_inter_inter) continue; + // Skip compound inter modes if ARF is not available. + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue; + } + + // For SVC, skip the golden (spatial) reference search if sse of zeromv_last + // is below threshold. + if (cpi->use_svc && ref_frame == GOLDEN_FRAME && + sse_zeromv_normalized < thresh_svc_skip_golden) + continue; if (sf->short_circuit_flat_blocks && x->source_variance == 0 && this_mode != NEARESTMV) { @@ -1515,9 +1735,28 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode))) continue; - ref_frame = ref_mode_set[idx].ref_frame; - if (cpi->use_svc) { - ref_frame = ref_mode_set_svc[idx].ref_frame; + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) { + if (cpi->rc.is_src_frame_alt_ref && + (ref_frame != ALTREF_FRAME || + frame_mv[this_mode][ref_frame].as_int != 0)) + continue; + + if (!cm->show_frame && ref_frame == ALTREF_FRAME && + frame_mv[this_mode][ref_frame].as_int != 0) + continue; + + if (cpi->rc.alt_ref_gf_group && cm->show_frame && + cpi->rc.frames_since_golden > (cpi->rc.baseline_gf_interval >> 1) && + ref_frame == GOLDEN_FRAME && + frame_mv[this_mode][ref_frame].as_int != 0) + continue; + + if (cpi->rc.alt_ref_gf_group && cm->show_frame && + cpi->rc.frames_since_golden > 0 && + cpi->rc.frames_since_golden < (cpi->rc.baseline_gf_interval >> 1) && + ref_frame == ALTREF_FRAME && + frame_mv[this_mode][ref_frame].as_int != 0) + continue; } if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; @@ -1527,38 +1766,58 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped // later. - if (force_skip_low_temp_var && ref_frame == GOLDEN_FRAME && + if (!force_gf_mv && force_skip_low_temp_var && ref_frame == GOLDEN_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) { continue; } - if (cpi->sf.short_circuit_low_temp_var == 2 && force_skip_low_temp_var && - ref_frame == LAST_FRAME && this_mode == NEWMV) { + if (x->content_state_sb != kVeryHighSad && + (cpi->sf.short_circuit_low_temp_var >= 2 || + (cpi->sf.short_circuit_low_temp_var == 1 && bsize == BLOCK_64X64)) && + force_skip_low_temp_var && ref_frame == LAST_FRAME && + this_mode == NEWMV) { continue; } if (cpi->use_svc) { - if (svc_force_zero_mode[ref_frame - 1] && + if (!force_gf_mv && svc_force_zero_mode[ref_frame - 1] && frame_mv[this_mode][ref_frame].as_int != 0) continue; } - if (!force_skip_low_temp_var && + if (sf->reference_masking && !(frame_mv[this_mode][ref_frame].as_int == 0 && ref_frame == LAST_FRAME)) { - i = (ref_frame == LAST_FRAME) ? 
GOLDEN_FRAME : LAST_FRAME; - if ((cpi->ref_frame_flags & flag_list[i]) && sf->reference_masking) - if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) + if (usable_ref_frame < ALTREF_FRAME) { + if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { + i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; + if ((cpi->ref_frame_flags & flag_list[i])) + if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) + ref_frame_skip_mask |= (1 << ref_frame); + } + } else if (!cpi->rc.is_src_frame_alt_ref && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == ALTREF_FRAME)) { + int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; + int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; + if (((cpi->ref_frame_flags & flag_list[ref1]) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || + ((cpi->ref_frame_flags & flag_list[ref2]) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) ref_frame_skip_mask |= (1 << ref_frame); + } } if (ref_frame_skip_mask & (1 << ref_frame)) continue; // Select prediction reference frames. - for (i = 0; i < MAX_MB_PLANE; i++) + for (i = 0; i < MAX_MB_PLANE; i++) { xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; + } mi->ref_frame[0] = ref_frame; - set_ref_ptrs(cm, xd, ref_frame, NONE); + mi->ref_frame[1] = second_ref_frame; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)]; mode_rd_thresh = best_mode_skip_txfm ? rd_threshes[mode_index] << 1 @@ -1570,16 +1829,20 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cpi->rc.frames_since_golden > 4) mode_rd_thresh = mode_rd_thresh << 3; - if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, - rd_thresh_freq_fact[mode_index])) + if ((cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index])) || + (!cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index]))) continue; - if (this_mode == NEWMV) { + if (this_mode == NEWMV && !force_gf_mv) { if (ref_frame > LAST_FRAME && !cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR) { int tmp_sad; uint32_t dis; - int cost_list[5]; + int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX }; if (bsize < BLOCK_16X16) continue; @@ -1607,17 +1870,37 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } else if (svc->use_base_mv && svc->spatial_layer_id) { if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) { const int pre_stride = xd->plane[0].pre[0].stride; - int base_mv_sad = INT_MAX; - const float base_mv_bias = sf->base_mv_aggressive ? 1.5f : 1.0f; + unsigned int base_mv_sse = UINT_MAX; + int scale = (cpi->rc.avg_frame_low_motion > 60) ? 2 : 4; const uint8_t *const pre_buf = xd->plane[0].pre[0].buf + (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride + (frame_mv[NEWMV][ref_frame].as_mv.col >> 3); - base_mv_sad = cpi->fn_ptr[bsize].sdf( - x->plane[0].src.buf, x->plane[0].src.stride, pre_buf, pre_stride); + cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + pre_buf, pre_stride, &base_mv_sse); - if (base_mv_sad < (int)(base_mv_bias * x->pred_mv_sad[ref_frame])) { + // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16x16, + // for SVC encoding. 
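The code that follows first bails out of NEWMV for small SVC blocks whose base-layer MV is (0,0), then replaces the old SAD-plus-bias acceptance test for the scaled base-layer MV with an SSE-based one: the candidate is rejected outright when its error is far above the best SSE seen so far (more aggressively on mostly static content) and accepted as a NEWMV seed only when it is within 2x of that best. A rough condensation (hypothetical helper; thresholds copied from the hunk):

    /* Decide whether the scaled base-layer MV is usable as a NEWMV seed. */
    static int sketch_accept_base_layer_mv(unsigned int base_mv_sse,
                                           unsigned int best_sse_sofar,
                                           int avg_frame_low_motion,
                                           int base_mv_aggressive) {
      const int scale = (avg_frame_low_motion > 60) ? 2 : 4;
      if (base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale))
        return 0; /* base-layer prediction error is too large */
      return base_mv_sse < (best_sse_sofar << 1); /* base layer mv is good */
    }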
+ if (cpi->use_svc && cpi->svc.use_base_mv && bsize < BLOCK_16X16 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + continue; + + // Exit NEWMV search if base_mv_sse is large. + if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale)) + continue; + if (base_mv_sse < (best_sse_sofar << 1)) { // Base layer mv is good. + // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since + // (0, 0) mode is already tested. + unsigned int base_mv_sse_normalized = + base_mv_sse >> + (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar && + base_mv_sse_normalized < 400 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + continue; if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, &frame_mv[NEWMV][ref_frame], &rate_mv, best_rdc.rdcost, 1)) { @@ -1640,6 +1923,22 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } } + // TODO(jianj): Skipping the testing of (duplicate) non-zero motion vector + // causes some regression, leave it for duplicate zero-mv for now, until + // regression issue is resolved. + for (inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV; inter_mv_mode++) { + if (inter_mv_mode == this_mode || comp_pred) continue; + if (mode_checked[inter_mv_mode][ref_frame] && + frame_mv[this_mode][ref_frame].as_int == + frame_mv[inter_mv_mode][ref_frame].as_int && + frame_mv[inter_mv_mode][ref_frame].as_int == 0) { + skip_this_mv = 1; + break; + } + } + + if (skip_this_mv) continue; + // If use_golden_nonzeromv is false, NEWMV mode is skipped for golden, no // need to compute best_pred_sad which is only used to skip golden NEWMV. if (use_golden_nonzeromv && this_mode == NEWMV && ref_frame == LAST_FRAME && @@ -1654,13 +1953,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, x->pred_mv_sad[LAST_FRAME] = best_pred_sad; } - if (this_mode != NEARESTMV && + if (this_mode != NEARESTMV && !comp_pred && frame_mv[this_mode][ref_frame].as_int == frame_mv[NEARESTMV][ref_frame].as_int) continue; mi->mode = this_mode; mi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int; + mi->mv[1].as_int = 0; // Search for the best prediction filter type, when the resulting // motion vector is at sub-pixel accuracy level for luma component, i.e., @@ -1678,17 +1978,19 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && pred_filter_search && (ref_frame == LAST_FRAME || - (ref_frame == GOLDEN_FRAME && + (ref_frame == GOLDEN_FRAME && !force_gf_mv && (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) && (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) { int pf_rate[3]; int64_t pf_dist[3]; + int curr_rate[3]; unsigned int pf_var[3]; unsigned int pf_sse[3]; TX_SIZE pf_tx_size[3]; int64_t best_cost = INT64_MAX; INTERP_FILTER best_filter = SWITCHABLE, filter; PRED_BUFFER *current_pred = this_mode_pred; + rd_computed = 1; for (filter = EIGHTTAP; filter <= EIGHTTAP_SMOOTH; ++filter) { int64_t cost; @@ -1696,6 +1998,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter], &pf_var[filter], &pf_sse[filter]); + curr_rate[filter] = pf_rate[filter]; pf_rate[filter] += vp9_get_switchable_rate(cpi, xd); cost = RDCOST(x->rdmult, x->rddiv, 
pf_rate[filter], pf_dist[filter]); pf_tx_size[filter] = mi->tx_size; @@ -1721,7 +2024,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, mi->interp_filter = best_filter; mi->tx_size = pf_tx_size[best_filter]; - this_rdc.rate = pf_rate[best_filter]; + this_rdc.rate = curr_rate[best_filter]; this_rdc.dist = pf_dist[best_filter]; var_y = pf_var[best_filter]; sse_y = pf_sse[best_filter]; @@ -1731,16 +2034,20 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, pd->dst.stride = this_mode_pred->stride; } } else { -// TODO(jackychen): the low-bitdepth condition causes a segfault in -// high-bitdepth builds. -// https://bugs.chromium.org/p/webm/issues/detail?id=1250 -#if CONFIG_VP9_HIGHBITDEPTH - const int large_block = bsize > BLOCK_32X32; -#else + // For low motion content use x->sb_is_skin in addition to VeryHighSad + // for setting large_block. const int large_block = - x->sb_is_skin ? bsize > BLOCK_32X32 : bsize >= BLOCK_32X32; -#endif + (x->content_state_sb == kVeryHighSad || + (x->sb_is_skin && cpi->rc.avg_frame_low_motion > 70) || + cpi->oxcf.speed < 7) + ? bsize > BLOCK_32X32 + : bsize >= BLOCK_32X32; mi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref; + + if (cpi->use_svc && ref_frame == GOLDEN_FRAME && + svc_force_zero_mode[ref_frame - 1]) + mi->interp_filter = filter_gf_svc; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); // For large partition blocks, extra testing is done. @@ -1749,17 +2056,26 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cm->base_qindex) { model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col, - &this_early_term); + &this_early_term, flag_preduv_computed); } else { + rd_computed = 1; model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, &var_y, &sse_y); } + // Save normalized sse (between current and last frame) for (0, 0) motion. 
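// Assuming b_width_log2_lookup[] and b_height_log2_lookup[] give the log2 of
// the block dimensions in 4-pel units, the shift below divides sse_y by the
// number of 4x4 units in the block, e.g. for BLOCK_64X64 the shift is
// 4 + 4 = 8, i.e. sse_y / 256, giving an average SSE per 4x4 unit.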
+ if (cpi->use_svc && ref_frame == LAST_FRAME && + frame_mv[this_mode][ref_frame].as_int == 0) { + sse_zeromv_normalized = + sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + } + if (sse_y < best_sse_sofar) best_sse_sofar = sse_y; } if (!this_early_term) { this_sse = (int64_t)sse_y; block_yrd(cpi, x, &this_rdc, &is_skippable, &this_sse, bsize, - VPXMIN(mi->tx_size, TX_16X16)); + VPXMIN(mi->tx_size, TX_16X16), rd_computed); + x->skip_txfm[0] = is_skippable; if (is_skippable) { this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); @@ -1785,13 +2101,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); } - if (x->color_sensitivity[0] || x->color_sensitivity[1]) { + if (!this_early_term && + (x->color_sensitivity[0] || x->color_sensitivity[1])) { RD_COST rdc_uv; const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]); - if (x->color_sensitivity[0]) + if (x->color_sensitivity[0] && !flag_preduv_computed[0]) { vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1); - if (x->color_sensitivity[1]) + flag_preduv_computed[0] = 1; + } + if (x->color_sensitivity[1] && !flag_preduv_computed[1]) { vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2); + flag_preduv_computed[1] = 1; + } model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &var_y, &sse_y, 1, 2); this_rdc.rate += rdc_uv.rate; this_rdc.dist += rdc_uv.dist; @@ -1800,6 +2121,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_rdc.rate += rate_mv; this_rdc.rate += cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]] [INTER_OFFSET(this_mode)]; + // TODO(marpan): Add costing for compound mode. this_rdc.rate += ref_frame_cost[ref_frame]; this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); @@ -1810,7 +2132,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, vp9_NEWMV_diff_bias(&cpi->noise_estimate, xd, this_mode, &this_rdc, bsize, frame_mv[this_mode][ref_frame].as_mv.row, frame_mv[this_mode][ref_frame].as_mv.col, - ref_frame == LAST_FRAME); + ref_frame == LAST_FRAME, x->lowvar_highsumdiff, + x->sb_is_skin); } // Skipping checking: test to see if this block can be reconstructed by @@ -1818,7 +2141,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cpi->allow_encode_breakout) { encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode, var_y, sse_y, yv12_mb, &this_rdc.rate, - &this_rdc.dist); + &this_rdc.dist, flag_preduv_computed); if (x->skip) { this_rdc.rate += rate_mv; this_rdc.rdcost = @@ -1827,7 +2150,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow) { vp9_denoiser_update_frame_stats(mi, sse_y, this_mode, ctx); // Keep track of zero_last cost. 
@@ -1838,6 +2161,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (void)ctx; #endif + mode_checked[this_mode][ref_frame] = 1; + if (this_rdc.rdcost < best_rdc.rdcost || x->skip) { best_rdc = this_rdc; best_mode = this_mode; @@ -1846,6 +2171,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, best_ref_frame = ref_frame; best_mode_skip_txfm = x->skip_txfm[0]; best_early_term = this_early_term; + best_second_ref_frame = second_ref_frame; if (reuse_inter_pred) { free_pred_buffer(best_pred); @@ -1872,6 +2198,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, mi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int; xd->mi[0]->bmi[0].as_mv[0].as_int = mi->mv[0].as_int; x->skip_txfm[0] = best_mode_skip_txfm; + mi->ref_frame[1] = best_second_ref_frame; // For spatial enhancemanent layer: perform intra prediction only if base // layer is chosen as the reference. Always perform intra prediction if @@ -1884,12 +2211,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, svc_force_zero_mode[best_ref_frame - 1]); inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; } + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->rc.is_src_frame_alt_ref) + perform_intra_pred = 0; // Perform intra prediction search, if the best SAD is above a certain // threshold. - if ((!force_skip_low_temp_var || bsize < BLOCK_32X32) && perform_intra_pred && - (best_rdc.rdcost == INT64_MAX || - (!x->skip && best_rdc.rdcost > inter_mode_thresh && - bsize <= cpi->sf.max_intra_bsize))) { + if (best_rdc.rdcost == INT64_MAX || + ((!force_skip_low_temp_var || bsize < BLOCK_32X32 || + x->content_state_sb == kVeryHighSad) && + perform_intra_pred && !x->skip && best_rdc.rdcost > inter_mode_thresh && + bsize <= cpi->sf.max_intra_bsize && !x->skip_low_source_sad && + !x->lowvar_highsumdiff)) { struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 }; int i; TX_SIZE best_intra_tx_size = TX_SIZES; @@ -1904,17 +2236,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_mode_pred = &tmp[get_pred_buffer(tmp, 3)]; #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) - vpx_highbd_convolve_copy(best_pred->data, best_pred->stride, - this_mode_pred->data, this_mode_pred->stride, - NULL, 0, NULL, 0, bw, bh, xd->bd); + vpx_highbd_convolve_copy( + CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride, + CONVERT_TO_SHORTPTR(this_mode_pred->data), this_mode_pred->stride, + NULL, 0, 0, 0, 0, bw, bh, xd->bd); else vpx_convolve_copy(best_pred->data, best_pred->stride, this_mode_pred->data, this_mode_pred->stride, NULL, - 0, NULL, 0, bw, bh); + 0, 0, 0, 0, bw, bh); #else vpx_convolve_copy(best_pred->data, best_pred->stride, this_mode_pred->data, this_mode_pred->stride, NULL, 0, - NULL, 0, bw, bh); + 0, 0, 0, bw, bh); #endif // CONFIG_VP9_HIGHBITDEPTH best_pred = this_mode_pred; } @@ -1933,8 +2266,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (!((1 << this_mode) & cpi->sf.intra_y_mode_bsize_mask[bsize])) continue; - if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, - rd_thresh_freq_fact[mode_index])) + if ((cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index])) || + (!cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index]))) continue; mi->mode = 
this_mode; @@ -1975,8 +2312,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, best_mode = this_mode; best_intra_tx_size = mi->tx_size; best_ref_frame = INTRA_FRAME; + best_second_ref_frame = NONE; mi->uv_mode = this_mode; mi->mv[0].as_int = INVALID_MV; + mi->mv[1].as_int = INVALID_MV; best_mode_skip_txfm = x->skip_txfm[0]; } } @@ -1992,6 +2331,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, pd->dst = orig_dst; mi->mode = best_mode; mi->ref_frame[0] = best_ref_frame; + mi->ref_frame[1] = best_second_ref_frame; x->skip_txfm[0] = best_mode_skip_txfm; if (!is_inter_block(mi)) { @@ -2002,23 +2342,30 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) { #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) - vpx_highbd_convolve_copy(best_pred->data, best_pred->stride, - pd->dst.buf, pd->dst.stride, NULL, 0, NULL, 0, - bw, bh, xd->bd); + vpx_highbd_convolve_copy( + CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride, + CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, 0, 0, 0, + bw, bh, xd->bd); else vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, - pd->dst.stride, NULL, 0, NULL, 0, bw, bh); + pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh); #else vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, - pd->dst.stride, NULL, 0, NULL, 0, bw, bh); + pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh); #endif // CONFIG_VP9_HIGHBITDEPTH } } #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && cpi->resize_pending == 0 && - cpi->denoiser.denoising_level > kDenLowLow && cpi->denoiser.reset == 0) { + denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow && + cpi->denoiser.reset == 0) { VP9_DENOISER_DECISION decision = COPY_BLOCK; + ctx->sb_skip_denoising = 0; + // TODO(marpan): There is an issue with denoising when the + // superblock partitioning scheme is based on the pickmode. + // Remove this condition when the issue is resolved. + if (x->sb_pickmode_part) ctx->sb_skip_denoising = 1; vp9_pickmode_ctx_den_update(&ctx_den, zero_last_cost_orig, ref_frame_cost, frame_mv, reuse_inter_pred, best_tx_size, best_mode, best_ref_frame, best_pred_filter, @@ -2030,6 +2377,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } #endif + if (best_ref_frame == ALTREF_FRAME || best_second_ref_frame == ALTREF_FRAME) + x->arf_frame_usage++; + else if (best_ref_frame != INTRA_FRAME) + x->lastgolden_frame_usage++; + if (cpi->sf.adaptive_rd_thresh) { THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mi->mode)]; @@ -2041,16 +2393,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // TODO(yunqingwang): Check intra mode mask and only update freq_fact // for those valid modes. 
for (i = 0; i < intra_modes; i++) { - update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize, - INTRA_FRAME, best_mode_idx, intra_mode_list[i]); + if (cpi->sf.adaptive_rd_thresh_row_mt) + update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance, + thresh_freq_fact_idx, INTRA_FRAME, + best_mode_idx, intra_mode_list[i]); + else + update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize, + INTRA_FRAME, best_mode_idx, + intra_mode_list[i]); } } else { for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) { PREDICTION_MODE this_mode; if (best_ref_frame != ref_frame) continue; for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { - update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize, - ref_frame, best_mode_idx, this_mode); + if (cpi->sf.adaptive_rd_thresh_row_mt) + update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance, + thresh_freq_fact_idx, ref_frame, + best_mode_idx, this_mode); + else + update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize, + ref_frame, best_mode_idx, this_mode); } } } @@ -2197,12 +2560,12 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, } vp9_set_mv_search_range(&x->mv_limits, - &mbmi_ext->ref_mvs[0]->as_mv); + &mbmi_ext->ref_mvs[ref_frame][0].as_mv); - vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, - x->sadperbit4, cond_cost_list(cpi, cost_list), - &mbmi_ext->ref_mvs[ref_frame][0].as_mv, - &tmp_mv, INT_MAX, 0); + vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, + x->sadperbit4, cond_cost_list(cpi, cost_list), + &mbmi_ext->ref_mvs[ref_frame][0].as_mv, &tmp_mv, INT_MAX, 0); x->mv_limits = tmp_mv_limits; @@ -2235,7 +2598,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vp9_highbd_build_inter_predictor( - pd->pre[0].buf, pd->pre[0].stride, pd->dst.buf, pd->dst.stride, + CONVERT_TO_SHORTPTR(pd->pre[0].buf), pd->pre[0].stride, + CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, &xd->mi[0]->bmi[i].as_mv[0].as_mv, &xd->block_refs[0]->sf, 4 * num_4x4_blocks_wide, 4 * num_4x4_blocks_high, 0, vp9_filter_kernels[mi->interp_filter], MV_PRECISION_Q3, diff --git a/libs/libvpx/vp9/encoder/vp9_quantize.c b/libs/libvpx/vp9/encoder/vp9_quantize.c index de96c6e068..09f61ead26 100644 --- a/libs/libvpx/vp9/encoder/vp9_quantize.c +++ b/libs/libvpx/vp9/encoder/vp9_quantize.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" @@ -21,77 +22,67 @@ #include "vp9/encoder/vp9_rd.h" void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int i, eob = -1; - // TODO(jingning) Decide the need of these arguments after the - // quantization process is completed. 
- (void)zbin_ptr; - (void)quant_shift_ptr; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = (tmp * quant_ptr[rc != 0]) >> 16; + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> 16; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (tmp) eob = i; - } + if (tmp) eob = i; } *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int i; int eob = -1; - // TODO(jingning) Decide the need of these arguments after the - // quantization process is completed. - (void)zbin_ptr; - (void)quant_shift_ptr; + (void)iscan; + (void)skip_block; + assert(!skip_block); - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + round_ptr[rc != 0]; - const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (abs_qcoeff) eob = i; - } + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. 
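// Worked example with hypothetical values (quant_ptr for the "fp" path is
// assumed to hold roughly (1 << 16) / dequant): with dequant = 32,
// quant = 2048 and round = 16, a coefficient of 100 quantizes as
//   tmp = ((100 + 16) * 2048) >> 16 = 3,
// so qcoeff becomes 3 and dqcoeff becomes 3 * 32 = 96.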
+ for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + round_ptr[rc != 0]; + const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16); + qcoeff_ptr[rc] = (tran_low_t)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + if (abs_qcoeff) eob = i; } *eob_ptr = eob + 1; } @@ -100,38 +91,35 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, // TODO(jingning) Refactor this file and combine functions with similar // operations. void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int i, eob = -1; - (void)zbin_ptr; - (void)quant_shift_ptr; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - int tmp = 0; - int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int tmp = 0; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { - abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); - tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - } - - if (tmp) eob = i; + if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; } + + if (tmp) eob = i; } *eob_ptr = eob + 1; } @@ -139,36 +127,33 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_quantize_fp_32x32_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int i, eob = -1; - (void)zbin_ptr; - (void)quant_shift_ptr; + (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - for (i = 0; i < n_coeffs; i++) { - uint32_t abs_qcoeff = 0; - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - 
coeff_sign; + for (i = 0; i < n_coeffs; i++) { + int abs_qcoeff = 0; + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { - const int64_t tmp = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - } - - if (abs_qcoeff) eob = i; + if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; } + + if (abs_qcoeff) eob = i; } *eob_ptr = eob + 1; } @@ -179,22 +164,28 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; + tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block), + *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const int n_coeffs = 4 * 4; + + if (x->skip_block) { + memset(qcoeff, 0, n_coeffs * sizeof(*qcoeff)); + memset(dqcoeff, 0, n_coeffs * sizeof(*dqcoeff)); + return; + } #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block, - p->zbin, p->round, p->quant, p->quant_shift, - BLOCK_OFFSET(p->qcoeff, block), - BLOCK_OFFSET(pd->dqcoeff, block), pd->dequant, + vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, + x->skip_block, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, &p->eobs[block], scan, iscan); return; } #endif - vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block, p->zbin, - p->round, p->quant, p->quant_shift, - BLOCK_OFFSET(p->qcoeff, block), - BLOCK_OFFSET(pd->dqcoeff, block), pd->dequant, &p->eobs[block], - scan, iscan); + vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, x->skip_block, + p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, &p->eobs[block], scan, iscan); } static void invert_quant(int16_t *quant, int16_t *shift, int d) { diff --git a/libs/libvpx/vp9/encoder/vp9_ratectrl.c b/libs/libvpx/vp9/encoder/vp9_ratectrl.c index 93eddd655a..b7f3a0e897 100644 --- a/libs/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/libs/libvpx/vp9/encoder/vp9_ratectrl.c @@ -15,6 +15,7 @@ #include #include +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" @@ -43,8 +44,6 @@ #define MIN_BPB_FACTOR 0.005 #define MAX_BPB_FACTOR 50 -#define FRAME_OVERHEAD_BITS 200 - #if CONFIG_VP9_HIGHBITDEPTH #define ASSIGN_MINQ_TABLE(bit_depth, name) \ do { \ @@ -90,10 +89,17 @@ static int inter_minq_12[QINDEX_RANGE]; static int rtc_minq_12[QINDEX_RANGE]; #endif +#ifdef AGGRESSIVE_VBR +static int gf_high = 2400; +static int gf_low = 400; +static int kf_high = 4000; +static int kf_low = 400; +#else static int gf_high = 2000; static int gf_low = 400; static int kf_high = 5000; static int kf_low = 400; +#endif // Functions to compute the active minq lookup table entries based on a // formulaic approach to facilitate easier adjustment of the Q tables. 
@@ -123,9 +129,14 @@ static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low, const double maxq = vp9_convert_qindex_to_q(i, bit_depth); kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth); kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); +#ifdef AGGRESSIVE_VBR + arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.275, bit_depth); + inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.80, bit_depth); +#else arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth); - arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth); +#endif + arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth); } } @@ -163,6 +174,17 @@ double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) { #endif } +int vp9_convert_q_to_qindex(double q_val, vpx_bit_depth_t bit_depth) { + int i; + + for (i = 0; i < QINDEX_RANGE; ++i) + if (vp9_convert_qindex_to_q(i, bit_depth) >= q_val) break; + + if (i == QINDEX_RANGE) i--; + + return i; +} + int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, double correction_factor, vpx_bit_depth_t bit_depth) { const double q = vp9_convert_qindex_to_q(qindex, bit_depth); @@ -182,24 +204,29 @@ int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth)); return VPXMAX(FRAME_OVERHEAD_BITS, - (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS); + (int)(((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS)); } int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { const RATE_CONTROL *rc = &cpi->rc; const VP9EncoderConfig *oxcf = &cpi->oxcf; - const int min_frame_target = - VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); - if (target < min_frame_target) target = min_frame_target; - if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { - // If there is an active ARF at this location use the minimum - // bits on this frame even if it is a constructed arf. - // The active maximum quantizer insures that an appropriate - // number of bits will be spent if needed for constructed ARFs. - target = min_frame_target; + + if (cpi->oxcf.pass != 2) { + const int min_frame_target = + VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); + if (target < min_frame_target) target = min_frame_target; + if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { + // If there is an active ARF at this location use the minimum + // bits on this frame even if it is a constructed arf. + // The active maximum quantizer insures that an appropriate + // number of bits will be spent if needed for constructed ARFs. + target = min_frame_target; + } } + // Clip the frame target to the maximum allowed value. 
if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; + if (oxcf->rc_max_inter_bitrate_pct) { const int max_rate = rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100; @@ -326,7 +353,10 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->af_ratio_onepass_vbr = 10; rc->prev_avg_source_sad_lag = 0; rc->high_source_sad = 0; + rc->reset_high_source_sad = 0; rc->high_source_sad_lagindex = -1; + rc->alt_ref_gf_group = 0; + rc->last_frame_is_src_altref = 0; rc->fac_active_worst_inter = 150; rc->fac_active_worst_gf = 100; rc->force_qpmin = 0; @@ -410,7 +440,7 @@ static double get_rate_correction_factor(const VP9_COMP *cpi) { } else { if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && !rc->is_src_frame_alt_ref && !cpi->use_svc && - (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 20)) + (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 100)) rcf = rc->rate_correction_factors[GF_ARF_STD]; else rcf = rc->rate_correction_factors[INTER_NORMAL]; @@ -436,7 +466,7 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) { } else { if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && !rc->is_src_frame_alt_ref && !cpi->use_svc && - (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 20)) + (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 100)) rc->rate_correction_factors[GF_ARF_STD] = factor; else rc->rate_correction_factors[INTER_NORMAL] = factor; @@ -519,6 +549,7 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) { int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, int active_best_quality, int active_worst_quality) { const VP9_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; int q = active_worst_quality; int last_error = INT_MAX; int i, target_bits_per_mb, bits_per_mb_at_this_q; @@ -533,7 +564,8 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, do { if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && - cpi->svc.temporal_layer_id == 0) { + cr->apply_cyclic_refresh && + (!cpi->oxcf.gf_cbr_boost_pct || !cpi->refresh_golden_frame)) { bits_per_mb_at_this_q = (int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor); } else { @@ -555,7 +587,9 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, // In CBR mode, this makes sure q is between oscillating Qs to prevent // resonance. - if (cpi->oxcf.rc_mode == VPX_CBR && + if (cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.reset_high_source_sad && + (!cpi->oxcf.gf_cbr_boost_pct || + !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) && (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) && cpi->rc.q_1_frame != cpi->rc.q_2_frame) { q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), @@ -640,7 +674,8 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { int active_worst_quality; int ambient_qp; unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers; - if (cm->frame_type == KEY_FRAME) return rc->worst_quality; + if (cm->frame_type == KEY_FRAME || rc->reset_high_source_sad) + return rc->worst_quality; // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME] // for the first few frames following key frame. These are both initialized // to worst_quality and updated with (3/4, 1/4) average in postencode_update. @@ -650,6 +685,17 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { ? 
VPXMIN(rc->avg_frame_qindex[INTER_FRAME], rc->avg_frame_qindex[KEY_FRAME]) : rc->avg_frame_qindex[INTER_FRAME]; + // For SVC if the current base spatial layer was key frame, use the QP from + // that base layer for ambient_qp. + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) { + int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + if (lc->is_key_frame) { + const RATE_CONTROL *lrc = &lc->rc; + ambient_qp = VPXMIN(ambient_qp, lrc->last_q[KEY_FRAME]); + } + } active_worst_quality = VPXMIN(rc->worst_quality, ambient_qp * 5 >> 2); if (rc->buffer_level > rc->optimal_buffer_level) { // Adjust down. @@ -961,6 +1007,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, qdelta = vp9_compute_qdelta_by_rate( &cpi->rc, cm->frame_type, active_worst_quality, 1.75, cm->bit_depth); } + if (rc->high_source_sad && cpi->sf.use_altref_onepass) qdelta = 0; *top_index = active_worst_quality + qdelta; *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index; } @@ -1289,6 +1336,28 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { } } +static void update_altref_usage(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + int sum_ref_frame_usage = 0; + int arf_frame_usage = 0; + int mi_row, mi_col; + if (cpi->rc.alt_ref_gf_group && !cpi->rc.is_src_frame_alt_ref && + !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8) { + int sboffset = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + sum_ref_frame_usage += cpi->count_arf_frame_usage[sboffset] + + cpi->count_lastgolden_frame_usage[sboffset]; + arf_frame_usage += cpi->count_arf_frame_usage[sboffset]; + } + } + if (sum_ref_frame_usage > 0) { + double altref_count = 100.0 * arf_frame_usage / sum_ref_frame_usage; + cpi->rc.perc_arf_usage = + 0.75 * cpi->rc.perc_arf_usage + 0.25 * altref_count; + } +} + static void compute_frame_low_motion(VP9_COMP *const cpi) { VP9_COMMON *const cm = &cpi->common; int mi_row, mi_col; @@ -1314,10 +1383,6 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { RATE_CONTROL *const rc = &cpi->rc; const int qindex = cm->base_qindex; - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) { - vp9_cyclic_refresh_postencode(cpi); - } - // Update rate control heuristics rc->projected_frame_size = (int)(bytes_used << 3); @@ -1416,8 +1481,15 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } if (oxcf->pass == 0) { - if (cm->frame_type != KEY_FRAME) compute_frame_low_motion(cpi); + if (cm->frame_type != KEY_FRAME) { + compute_frame_low_motion(cpi); + if (cpi->sf.use_altref_onepass) update_altref_usage(cpi); + } + cpi->rc.last_frame_is_src_altref = cpi->rc.is_src_frame_alt_ref; } + if (cm->frame_type != KEY_FRAME) rc->reset_high_source_sad = 0; + + rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth; } void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { @@ -1429,24 +1501,16 @@ void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { cpi->rc.rc_1_frame = 0; } -// Use this macro to turn on/off use of alt-refs in one-pass mode. 
-#define USE_ALTREF_FOR_ONE_PASS 1 - static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { const RATE_CONTROL *const rc = &cpi->rc; - int target; const int af_ratio = rc->af_ratio_onepass_vbr; -#if USE_ALTREF_FOR_ONE_PASS - target = + int target = (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) / (rc->baseline_gf_interval + af_ratio - 1) : (rc->avg_frame_bandwidth * rc->baseline_gf_interval) / (rc->baseline_gf_interval + af_ratio - 1); -#else - target = rc->avg_frame_bandwidth; -#endif return vp9_rc_clamp_pframe_target_size(cpi, target); } @@ -1499,8 +1563,8 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0) { vp9_cyclic_refresh_set_golden_update(cpi); } else { - rc->baseline_gf_interval = - (rc->min_gf_interval + rc->max_gf_interval) / 2; + rc->baseline_gf_interval = VPXMIN( + 20, VPXMAX(10, (rc->min_gf_interval + rc->max_gf_interval) / 2)); } rc->af_ratio_onepass_vbr = 10; if (rc->rolling_target_bits > 0) @@ -1518,14 +1582,20 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { // Adjust boost and af_ratio based on avg_frame_low_motion, which varies // between 0 and 100 (stationary, 100% zero/small motion). rc->gfu_boost = - VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / - (rc->avg_frame_low_motion + 100)); + VPXMAX(500, + DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / + (rc->avg_frame_low_motion + 100)); rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400)); } adjust_gfint_frame_constraint(cpi, rc->frames_to_key); rc->frames_till_gf_update_due = rc->baseline_gf_interval; cpi->refresh_golden_frame = 1; - rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS; + rc->source_alt_ref_pending = 0; + rc->alt_ref_gf_group = 0; + if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) { + rc->source_alt_ref_pending = 1; + rc->alt_ref_gf_group = 1; + } } if (cm->frame_type == KEY_FRAME) target = calc_iframe_target_size_one_pass_vbr(cpi); @@ -1802,6 +1872,26 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, // Clamp min to max rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval); + + if (oxcf->target_level == LEVEL_AUTO) { + const uint32_t pic_size = cpi->common.width * cpi->common.height; + const uint32_t pic_breadth = + VPXMAX(cpi->common.width, cpi->common.height); + int i; + for (i = LEVEL_1; i < LEVEL_MAX; ++i) { + if (vp9_level_defs[i].max_luma_picture_size >= pic_size && + vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) { + if (rc->min_gf_interval <= + (int)vp9_level_defs[i].min_altref_distance) { + rc->min_gf_interval = + (int)vp9_level_defs[i].min_altref_distance + 1; + rc->max_gf_interval = + VPXMAX(rc->max_gf_interval, rc->min_gf_interval); + } + break; + } + } + } } } @@ -1888,9 +1978,11 @@ void vp9_set_target_rate(VP9_COMP *cpi) { else target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); - // Correction to rate target based on prior over or under shoot. - if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ) - vbr_rate_correction(cpi, &target_rate); + if (!cpi->oxcf.vbr_corpus_complexity) { + // Correction to rate target based on prior over or under shoot. 
+ if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ) + vbr_rate_correction(cpi, &target_rate); + } vp9_rc_set_frame_target(cpi, target_rate); } @@ -2025,7 +2117,8 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) { return resize_action; } -void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { +static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, + uint64_t avg_sad_current) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int target; @@ -2036,7 +2129,7 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { uint64_t avg_source_sad_lag = avg_sad_current; int high_source_sad_lagindex = -1; int steady_sad_lagindex = -1; - uint32_t sad_thresh1 = 60000; + uint32_t sad_thresh1 = 70000; uint32_t sad_thresh2 = 120000; int low_content = 0; int high_content = 0; @@ -2088,8 +2181,8 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { rc->high_source_sad_lagindex = high_source_sad_lagindex; // Adjust some factors for the next GF group, ignore initial key frame, // and only for lag_in_frames not too small. - if (cpi->refresh_golden_frame == 1 && cm->frame_type != KEY_FRAME && - cm->current_video_frame > 30 && cpi->oxcf.lag_in_frames > 8) { + if (cpi->refresh_golden_frame == 1 && cm->current_video_frame > 30 && + cpi->oxcf.lag_in_frames > 8) { int frame_constraint; if (rc->rolling_target_bits > 0) rate_err = @@ -2110,6 +2203,8 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { ? VPXMAX(10, rc->baseline_gf_interval >> 1) : VPXMAX(6, rc->baseline_gf_interval >> 1); } + if (rc->baseline_gf_interval > cpi->oxcf.lag_in_frames - 1) + rc->baseline_gf_interval = cpi->oxcf.lag_in_frames - 1; // Check for constraining gf_interval for up-coming scene/content changes, // or for up-coming key frame, whichever is closer. frame_constraint = rc->frames_to_key; @@ -2123,9 +2218,14 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { // Adjust factors for active_worst setting & af_ratio for next gf interval. rc->fac_active_worst_inter = 150; // corresponds to 3/2 (= 150 /100). rc->fac_active_worst_gf = 100; - if (rate_err < 1.5 && !high_content) { + if (rate_err < 2.0 && !high_content) { rc->fac_active_worst_inter = 120; rc->fac_active_worst_gf = 90; + } else if (rate_err > 8.0 && rc->avg_frame_qindex[INTER_FRAME] < 16) { + // Increase active_worst faster at low Q if rate fluctuation is high. + rc->fac_active_worst_inter = 200; + if (rc->avg_frame_qindex[INTER_FRAME] < 8) + rc->fac_active_worst_inter = 400; } if (low_content && rc->avg_frame_low_motion > 80) { rc->af_ratio_onepass_vbr = 15; @@ -2133,6 +2233,30 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { rc->af_ratio_onepass_vbr = 5; rc->gfu_boost = DEFAULT_GF_BOOST >> 2; } + if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) { + // Flag to disable usage of ARF based on past usage, only allow this + // disabling if current frame/group does not start with key frame or + // scene cut. Note perc_arf_usage is only computed for speed >= 5. + int arf_usage_low = + (cm->frame_type != KEY_FRAME && !rc->high_source_sad && + cpi->rc.perc_arf_usage < 15 && cpi->oxcf.speed >= 5); + // Don't use alt-ref for this group under certain conditions. 
+ if (arf_usage_low || + (rc->high_source_sad_lagindex > 0 && + rc->high_source_sad_lagindex <= rc->frames_till_gf_update_due) || + (avg_source_sad_lag > 3 * sad_thresh1 >> 3)) { + rc->source_alt_ref_pending = 0; + rc->alt_ref_gf_group = 0; + } else { + rc->source_alt_ref_pending = 1; + rc->alt_ref_gf_group = 1; + // If alt-ref is used for this gf group, limit the interval. + if (rc->baseline_gf_interval > 12) { + rc->baseline_gf_interval = 12; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + } + } + } target = calc_pframe_target_size_one_pass_vbr(cpi); vp9_rc_set_frame_target(cpi, target); } @@ -2144,9 +2268,12 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { // in content and allow rate control to react. // This function also handles special case of lag_in_frames, to measure content // level in #future frames set by the lag_in_frames. -void vp9_avg_source_sad(VP9_COMP *cpi) { +void vp9_scene_detection_onepass(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) return; +#endif rc->high_source_sad = 0; if (cpi->Last_Source != NULL && cpi->Last_Source->y_width == cpi->Source->y_width && @@ -2159,11 +2286,14 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { int start_frame = 0; int frames_to_buffer = 1; int frame = 0; + int scene_cut_force_key_frame = 0; uint64_t avg_sad_current = 0; uint32_t min_thresh = 4000; float thresh = 8.0f; + uint32_t thresh_key = 140000; + if (cpi->oxcf.speed <= 5) thresh_key = 240000; if (cpi->oxcf.rc_mode == VPX_VBR) { - min_thresh = 60000; + min_thresh = 65000; thresh = 2.1f; } if (cpi->oxcf.lag_in_frames > 0) { @@ -2189,6 +2319,8 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { rc->high_source_sad = 1; else rc->high_source_sad = 0; + if (rc->high_source_sad && avg_sad_current > thresh_key) + scene_cut_force_key_frame = 1; // Update recursive average for current frame. if (avg_sad_current > 0) rc->avg_source_sad[0] = @@ -2208,6 +2340,7 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { const BLOCK_SIZE bsize = BLOCK_64X64; // Loop over sub-sample of frame, compute average sad over 64x64 blocks. uint64_t avg_sad = 0; + uint64_t tmp_sad = 0; int num_samples = 0; int sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; int sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; @@ -2220,13 +2353,14 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { for (sbi_row = 0; sbi_row < sb_rows; ++sbi_row) { for (sbi_col = 0; sbi_col < sb_cols; ++sbi_col) { // Checker-board pattern, ignore boundary. 
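// That is, boundary superblocks are skipped and, of the interior ones, only
// those where sbi_row and sbi_col share the same parity are sampled
// (roughly half of the interior superblocks).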
- if ((sbi_row > 0 && sbi_col > 0) && - (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) && - ((sbi_row % 2 == 0 && sbi_col % 2 == 0) || - (sbi_row % 2 != 0 && sbi_col % 2 != 0))) { + if (((sbi_row > 0 && sbi_col > 0) && + (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) && + ((sbi_row % 2 == 0 && sbi_col % 2 == 0) || + (sbi_row % 2 != 0 && sbi_col % 2 != 0)))) { + tmp_sad = cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, + last_src_ystride); + avg_sad += tmp_sad; num_samples++; - avg_sad += cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, - last_src_ystride); } src_y += 64; last_src_y += 64; @@ -2247,6 +2381,8 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { rc->high_source_sad = 1; else rc->high_source_sad = 0; + if (rc->high_source_sad && avg_sad > thresh_key) + scene_cut_force_key_frame = 1; if (avg_sad > 0 || cpi->oxcf.rc_mode == VPX_CBR) rc->avg_source_sad[0] = (3 * rc->avg_source_sad[0] + avg_sad) >> 2; } else { @@ -2254,6 +2390,23 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { } } } + // For CBR non-screen content mode, check if we should reset the rate + // control. Reset is done if high_source_sad is detected and the rate + // control is at very low QP with rate correction factor at min level. + if (cpi->oxcf.rc_mode == VPX_CBR && + cpi->oxcf.content != VP9E_CONTENT_SCREEN && !cpi->use_svc) { + if (rc->high_source_sad && rc->last_q[INTER_FRAME] == rc->best_quality && + rc->avg_frame_qindex[INTER_FRAME] < (rc->best_quality << 1) && + rc->rate_correction_factors[INTER_NORMAL] == MIN_BPB_FACTOR) { + rc->rate_correction_factors[INTER_NORMAL] = 0.5; + rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality; + rc->buffer_level = rc->optimal_buffer_level; + rc->bits_off_target = rc->optimal_buffer_level; + rc->reset_high_source_sad = 1; + } + if (cm->frame_type != KEY_FRAME && rc->reset_high_source_sad) + rc->this_frame_target = rc->avg_frame_bandwidth; + } // For VBR, under scene change/high content change, force golden refresh. if (cpi->oxcf.rc_mode == VPX_VBR && cm->frame_type != KEY_FRAME && rc->high_source_sad && rc->frames_to_key > 3 && @@ -2261,6 +2414,10 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { cpi->ext_refresh_frame_flags_pending == 0) { int target; cpi->refresh_golden_frame = 1; + if (scene_cut_force_key_frame) cm->frame_type = KEY_FRAME; + rc->source_alt_ref_pending = 0; + if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) + rc->source_alt_ref_pending = 1; rc->gfu_boost = DEFAULT_GF_BOOST >> 1; rc->baseline_gf_interval = VPXMIN(20, VPXMAX(10, rc->baseline_gf_interval)); diff --git a/libs/libvpx/vp9/encoder/vp9_ratectrl.h b/libs/libvpx/vp9/encoder/vp9_ratectrl.h index 6006e9b051..c1b210677e 100644 --- a/libs/libvpx/vp9/encoder/vp9_ratectrl.h +++ b/libs/libvpx/vp9/encoder/vp9_ratectrl.h @@ -21,6 +21,9 @@ extern "C" { #endif +// Used to control aggressive VBR mode. +// #define AGGRESSIVE_VBR 1 + // Bits Per MB at different Q (Multiplied by 512) #define BPER_MB_NORMBITS 9 @@ -29,6 +32,8 @@ extern "C" { #define FIXED_GF_INTERVAL 8 // Used in some testing modes only #define ONEHALFONLY_RESIZE 0 +#define FRAME_OVERHEAD_BITS 200 + typedef enum { INTER_NORMAL = 0, INTER_HIGH = 1, @@ -147,6 +152,8 @@ typedef struct { int rc_2_frame; int q_1_frame; int q_2_frame; + // Keep track of the last target average frame bandwidth. + int last_avg_frame_bandwidth; // Auto frame-scaling variables. 
FRAME_SCALE_LEVEL frame_size_selector; @@ -160,11 +167,15 @@ typedef struct { uint64_t avg_source_sad[MAX_LAG_BUFFERS]; uint64_t prev_avg_source_sad_lag; int high_source_sad_lagindex; + int alt_ref_gf_group; + int last_frame_is_src_altref; int high_source_sad; int count_last_scene_change; int avg_frame_low_motion; int af_ratio_onepass_vbr; int force_qpmin; + int reset_high_source_sad; + double perc_arf_usage; } RATE_CONTROL; struct VP9_COMP; @@ -178,6 +189,8 @@ int vp9_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs, double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth); +int vp9_convert_q_to_qindex(double q_val, vpx_bit_depth_t bit_depth); + void vp9_rc_init_minq_luts(void); int vp9_rc_get_default_min_gf_interval(int width, int height, double framerate); @@ -277,7 +290,7 @@ void vp9_set_target_rate(struct VP9_COMP *cpi); int vp9_resize_one_pass_cbr(struct VP9_COMP *cpi); -void vp9_avg_source_sad(struct VP9_COMP *cpi); +void vp9_scene_detection_onepass(struct VP9_COMP *cpi); int vp9_encodedframe_overshoot(struct VP9_COMP *cpi, int frame_size, int *q); diff --git a/libs/libvpx/vp9/encoder/vp9_rd.c b/libs/libvpx/vp9/encoder/vp9_rd.c index 76c639a647..6b2306ce9b 100644 --- a/libs/libvpx/vp9/encoder/vp9_rd.c +++ b/libs/libvpx/vp9/encoder/vp9_rd.c @@ -146,7 +146,7 @@ static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12, static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, 128, 144 }; -int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { +int64_t vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { const int64_t q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth); #if CONFIG_VP9_HIGHBITDEPTH int64_t rdmult = 0; @@ -161,6 +161,12 @@ int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { #else int64_t rdmult = 88 * q * q / 24; #endif // CONFIG_VP9_HIGHBITDEPTH + return rdmult; +} + +int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { + int64_t rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex); + if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; @@ -306,63 +312,62 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { } } +// NOTE: The tables below must be of the same size. + +// The functions described below are sampled at the four most significant +// bits of x^2 + 8 / 256. + +// Normalized rate: +// This table models the rate for a Laplacian source with given variance +// when quantized with a uniform quantizer with given stepsize. The +// closed form expression is: +// Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], +// where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), +// and H(x) is the binary entropy function. 
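As a rough illustration of the closed-form model quoted above (a sketch for this note only, not code from libvpx), the expression can be evaluated directly in floating point; called with x = sqrt(xsq_iq_q10[i] / 1024.0), it should reproduce rate_tab_q10[i] to within rounding (entry 0 is a saturated special case, since r approaches 1 there):

    #include <math.h>

    /* Binary entropy H(p) in bits, for p in (0, 1). */
    static double binary_entropy(double p) {
      return -p * log2(p) - (1.0 - p) * log2(1.0 - p);
    }

    /* Normalized rate Rn(x) in Q10 for x = qpstep / sqrt(variance), following
       Rn(x) = H(sqrt(r)) + sqrt(r) * (1 + H(r) / (1 - r)),
       with r = exp(-sqrt(2) * x). */
    static int rate_q10_from_x(double x) {
      const double r = exp(-sqrt(2.0) * x);
      const double rn = binary_entropy(sqrt(r)) +
                        sqrt(r) * (1.0 + binary_entropy(r) / (1.0 - r));
      return (int)(rn * 1024.0 + 0.5);
    }

For example, x = sqrt(4.0 / 1024.0) = 0.0625 yields about 6086, matching rate_tab_q10[1].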
+static const int rate_tab_q10[] = { + 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, 4044, + 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, 3133, 3037, + 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, 2290, 2232, 2179, + 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, 1608, 1530, 1460, 1398, + 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963, 911, 864, 821, 781, 745, + 680, 623, 574, 530, 490, 455, 424, 395, 345, 304, 269, 239, 213, + 190, 171, 154, 126, 104, 87, 73, 61, 52, 44, 38, 28, 21, + 16, 12, 10, 8, 6, 5, 3, 2, 1, 1, 1, 0, 0, +}; + +// Normalized distortion: +// This table models the normalized distortion for a Laplacian source +// with given variance when quantized with a uniform quantizer +// with given stepsize. The closed form expression is: +// Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) +// where x = qpstep / sqrt(variance). +// Note the actual distortion is Dn * variance. +static const int dist_tab_q10[] = { + 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, + 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17, 18, 21, + 24, 26, 29, 31, 34, 36, 39, 44, 49, 54, 59, 64, 69, + 73, 78, 88, 97, 106, 115, 124, 133, 142, 151, 167, 184, 200, + 215, 231, 245, 260, 274, 301, 327, 351, 375, 397, 418, 439, 458, + 495, 528, 559, 587, 613, 637, 659, 680, 717, 749, 777, 801, 823, + 842, 859, 874, 899, 919, 936, 949, 960, 969, 977, 983, 994, 1001, + 1006, 1010, 1013, 1015, 1017, 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024, +}; +static const int xsq_iq_q10[] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, + 40, 48, 56, 64, 72, 80, 88, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 256, 288, + 320, 352, 384, 416, 448, 480, 544, 608, 672, + 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504, + 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296, + 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136, + 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328, + 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736, + 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696, + 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808, + 180192, 196576, 212960, 229344, 245728, +}; + static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { - // NOTE: The tables below must be of the same size. - - // The functions described below are sampled at the four most significant - // bits of x^2 + 8 / 256. - - // Normalized rate: - // This table models the rate for a Laplacian source with given variance - // when quantized with a uniform quantizer with given stepsize. The - // closed form expression is: - // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], - // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), - // and H(x) is the binary entropy function. 
- static const int rate_tab_q10[] = { - 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, - 4044, 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, - 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, - 2290, 2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, - 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963, - 911, 864, 821, 781, 745, 680, 623, 574, 530, 490, 455, 424, - 395, 345, 304, 269, 239, 213, 190, 171, 154, 126, 104, 87, - 73, 61, 52, 44, 38, 28, 21, 16, 12, 10, 8, 6, - 5, 3, 2, 1, 1, 1, 0, 0, - }; - - // Normalized distortion: - // This table models the normalized distortion for a Laplacian source - // with given variance when quantized with a uniform quantizer - // with given stepsize. The closed form expression is: - // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) - // where x = qpstep / sqrt(variance). - // Note the actual distortion is Dn * variance. - static const int dist_tab_q10[] = { - 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, - 5, 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17, - 18, 21, 24, 26, 29, 31, 34, 36, 39, 44, 49, 54, - 59, 64, 69, 73, 78, 88, 97, 106, 115, 124, 133, 142, - 151, 167, 184, 200, 215, 231, 245, 260, 274, 301, 327, 351, - 375, 397, 418, 439, 458, 495, 528, 559, 587, 613, 637, 659, - 680, 717, 749, 777, 801, 823, 842, 859, 874, 899, 919, 936, - 949, 960, 969, 977, 983, 994, 1001, 1006, 1010, 1013, 1015, 1017, - 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024, - }; - static const int xsq_iq_q10[] = { - 0, 4, 8, 12, 16, 20, 24, 28, 32, - 40, 48, 56, 64, 72, 80, 88, 96, 112, - 128, 144, 160, 176, 192, 208, 224, 256, 288, - 320, 352, 384, 416, 448, 480, 544, 608, 672, - 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504, - 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296, - 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136, - 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328, - 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736, - 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696, - 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808, - 180192, 196576, 212960, 229344, 245728, - }; const int tmp = (xsq_q10 >> 2) + 8; const int k = get_msb(tmp) - 3; const int xq = (k << 3) + ((tmp >> k) & 0x7); @@ -373,6 +378,24 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; } +static void model_rd_norm_vec(int xsq_q10[MAX_MB_PLANE], + int r_q10[MAX_MB_PLANE], + int d_q10[MAX_MB_PLANE]) { + int i; + const int one_q10 = 1 << 10; + for (i = 0; i < MAX_MB_PLANE; ++i) { + const int tmp = (xsq_q10[i] >> 2) + 8; + const int k = get_msb(tmp) - 3; + const int xq = (k << 3) + ((tmp >> k) & 0x7); + const int a_q10 = ((xsq_q10[i] - xsq_iq_q10[xq]) << 10) >> (2 + k); + const int b_q10 = one_q10 - a_q10; + r_q10[i] = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10; + d_q10[i] = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; + } +} + +static const uint32_t MAX_XSQ_Q10 = 245727; + void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, unsigned int qstep, int *rate, int64_t *dist) { @@ -387,7 +410,6 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, *dist = 0; } else { int d_q10, r_q10; - static const uint32_t MAX_XSQ_Q10 = 245727; const uint64_t xsq_q10_64 = (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var; const int xsq_q10 = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10); @@ -397,6 
+419,30 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, } } +// Implements a fixed length vector form of vp9_model_rd_from_var_lapndz where +// vectors are of length MAX_MB_PLANE and all elements of var are non-zero. +void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], + unsigned int n_log2[MAX_MB_PLANE], + unsigned int qstep[MAX_MB_PLANE], + int64_t *rate_sum, int64_t *dist_sum) { + int i; + int xsq_q10[MAX_MB_PLANE], d_q10[MAX_MB_PLANE], r_q10[MAX_MB_PLANE]; + for (i = 0; i < MAX_MB_PLANE; ++i) { + const uint64_t xsq_q10_64 = + (((uint64_t)qstep[i] * qstep[i] << (n_log2[i] + 10)) + (var[i] >> 1)) / + var[i]; + xsq_q10[i] = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10); + } + model_rd_norm_vec(xsq_q10, r_q10, d_q10); + for (i = 0; i < MAX_MB_PLANE; ++i) { + int rate = + ROUND_POWER_OF_TWO(r_q10[i] << n_log2[i], 10 - VP9_PROB_COST_SHIFT); + int64_t dist = (var[i] * (int64_t)d_q10[i] + 512) >> 10; + *rate_sum += rate; + *dist_sum += dist; + } +} + void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd, ENTROPY_CONTEXT t_above[16], @@ -624,19 +670,21 @@ void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh, } } -int vp9_get_intra_cost_penalty(int qindex, int qdelta, - vpx_bit_depth_t bit_depth) { - const int q = vp9_dc_quant(qindex, qdelta, bit_depth); -#if CONFIG_VP9_HIGHBITDEPTH - switch (bit_depth) { - case VPX_BITS_8: return 20 * q; - case VPX_BITS_10: return 5 * q; - case VPX_BITS_12: return ROUND_POWER_OF_TWO(5 * q, 2); - default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; - } -#else - return 20 * q; -#endif // CONFIG_VP9_HIGHBITDEPTH +int vp9_get_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize, + int qindex, int qdelta) { + // Reduce the intra cost penalty for small blocks (<=16x16). + int reduction_fac = + (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0; + + if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh) + // Don't reduce intra cost penalty if estimated noise level is high. + reduction_fac = 0; + + // Always use VPX_BITS_8 as input here because the penalty is applied + // to rate not distortion so we want a consistent penalty for all bit + // depths. If the actual bit depth were passed in here then the value + // retured by vp9_dc_quant() would scale with the bit depth and we would + // then need to apply inverse scaling to correct back to a bit depth + // independent rate penalty. 
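// For example, with a hypothetical vp9_dc_quant(qindex, qdelta, VPX_BITS_8)
// value of 40, the base penalty is 20 * 40 = 800; a BLOCK_16X16 then gets
// 800 >> 2 = 200 and a BLOCK_8X8 gets 800 >> 4 = 50.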
+ return (20 * vp9_dc_quant(qindex, qdelta, VPX_BITS_8)) >> reduction_fac; } diff --git a/libs/libvpx/vp9/encoder/vp9_rd.h b/libs/libvpx/vp9/encoder/vp9_rd.h index 05344b6cf0..59022c106e 100644 --- a/libs/libvpx/vp9/encoder/vp9_rd.h +++ b/libs/libvpx/vp9/encoder/vp9_rd.h @@ -38,6 +38,7 @@ extern "C" { #define MAX_MODES 30 #define MAX_REFS 6 +#define RD_THRESH_INIT_FACT 32 #define RD_THRESH_MAX_FACT 64 #define RD_THRESH_INC 1 @@ -128,6 +129,9 @@ struct TileDataEnc; struct VP9_COMP; struct macroblock; +int64_t vp9_compute_rd_mult_based_on_qindex(const struct VP9_COMP *cpi, + int qindex); + int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex); void vp9_initialize_rd_consts(struct VP9_COMP *cpi); @@ -137,6 +141,11 @@ void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex); void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n, unsigned int qstep, int *rate, int64_t *dist); +void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], + unsigned int n_log2[MAX_MB_PLANE], + unsigned int qstep[MAX_MB_PLANE], + int64_t *rate_sum, int64_t *dist_sum); + int vp9_get_switchable_rate(const struct VP9_COMP *cpi, const MACROBLOCKD *const xd); @@ -164,8 +173,8 @@ void vp9_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh, int bsize, int best_mode_index); static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, - int thresh_fact) { - return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX; + const int *const thresh_fact) { + return best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX; } static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) { @@ -182,13 +191,18 @@ void vp9_setup_pred_block(const MACROBLOCKD *xd, const struct scale_factors *scale, const struct scale_factors *scale_uv); -int vp9_get_intra_cost_penalty(int qindex, int qdelta, - vpx_bit_depth_t bit_depth); +int vp9_get_intra_cost_penalty(const struct VP9_COMP *const cpi, + BLOCK_SIZE bsize, int qindex, int qdelta); +unsigned int vp9_get_sby_variance(struct VP9_COMP *cpi, + const struct buf_2d *ref, BLOCK_SIZE bs); unsigned int vp9_get_sby_perpixel_variance(struct VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs); #if CONFIG_VP9_HIGHBITDEPTH +unsigned int vp9_high_get_sby_variance(struct VP9_COMP *cpi, + const struct buf_2d *ref, BLOCK_SIZE bs, + int bd); unsigned int vp9_high_get_sby_perpixel_variance(struct VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd); diff --git a/libs/libvpx/vp9/encoder/vp9_rdopt.c b/libs/libvpx/vp9/encoder/vp9_rdopt.c index 27d4e9d6d3..2ba6378c5e 100644 --- a/libs/libvpx/vp9/encoder/vp9_rdopt.c +++ b/libs/libvpx/vp9/encoder/vp9_rdopt.c @@ -164,17 +164,19 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, const int ref = xd->mi[0]->ref_frame[0]; unsigned int sse; unsigned int var = 0; - unsigned int sum_sse = 0; int64_t total_sse = 0; int skip_flag = 1; const int shift = 6; - int rate; int64_t dist; const int dequant_shift = #if CONFIG_VP9_HIGHBITDEPTH (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
xd->bd - 5 : #endif // CONFIG_VP9_HIGHBITDEPTH 3; + unsigned int qstep_vec[MAX_MB_PLANE]; + unsigned int nlog2_vec[MAX_MB_PLANE]; + unsigned int sum_sse_vec[MAX_MB_PLANE]; + int any_zero_sum_sse = 0; x->pred_sse[ref] = 0; @@ -186,6 +188,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size]; const int64_t dc_thr = p->quant_thred[0] >> shift; const int64_t ac_thr = p->quant_thred[1] >> shift; + unsigned int sum_sse = 0; // The low thresholds are used to measure if the prediction errors are // low enough so that we can skip the mode search. const int64_t low_dc_thr = VPXMIN(50, dc_thr >> 2); @@ -196,8 +199,6 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, int lw = b_width_log2_lookup[unit_size] + 2; int lh = b_height_log2_lookup[unit_size] + 2; - sum_sse = 0; - for (idy = 0; idy < bh; ++idy) { for (idx = 0; idx < bw; ++idx) { uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw); @@ -233,12 +234,18 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, } total_sse += sum_sse; + sum_sse_vec[i] = sum_sse; + any_zero_sum_sse = any_zero_sum_sse || (sum_sse == 0); + qstep_vec[i] = pd->dequant[1] >> dequant_shift; + nlog2_vec[i] = num_pels_log2_lookup[bs]; + } - // Fast approximate the modelling function. - if (cpi->sf.simple_model_rd_from_var) { + // Fast approximate the modelling function. + if (cpi->sf.simple_model_rd_from_var) { + for (i = 0; i < MAX_MB_PLANE; ++i) { int64_t rate; - const int64_t square_error = sum_sse; - int quantizer = (pd->dequant[1] >> dequant_shift); + const int64_t square_error = sum_sse_vec[i]; + int quantizer = qstep_vec[i]; if (quantizer < 120) rate = (square_error * (280 - quantizer)) >> (16 - VP9_PROB_COST_SHIFT); @@ -247,12 +254,19 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, dist = (square_error * quantizer) >> 8; rate_sum += rate; dist_sum += dist; + } + } else { + if (any_zero_sum_sse) { + for (i = 0; i < MAX_MB_PLANE; ++i) { + int rate; + vp9_model_rd_from_var_lapndz(sum_sse_vec[i], nlog2_vec[i], qstep_vec[i], + &rate, &dist); + rate_sum += rate; + dist_sum += dist; + } } else { - vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs], - pd->dequant[1] >> dequant_shift, &rate, - &dist); - rate_sum += rate; - dist_sum += dist; + vp9_model_rd_from_var_lapndz_vec(sum_sse_vec, nlog2_vec, qstep_vec, + &rate_sum, &dist_sum); } } @@ -284,22 +298,12 @@ int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, return error; } -int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *coeff, - const tran_low_t *dqcoeff, - intptr_t block_size, int64_t *ssz) { - // Note that the C versions of these 2 functions (vp9_block_error and - // vp9_highbd_block_error_8bit are the same, but the optimized assembly - // routines are not compatible in the non high bitdepth configuration, so - // they still cannot share the same name. 
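The comment being removed above notes that the 8-bit and high-bit-depth C reference paths compute the same quantity, which is why the dispatch below can now call vp9_block_error directly when bd == 8. For reference, that quantity is the summed squared difference between the transform coefficients and their dequantized values, with *ssz returning the coefficient energy. A minimal sketch under that reading (the typedef and helper name are stand-ins, not the library's):

    #include <stdint.h>

    typedef int32_t tran_low_t; /* stand-in; the real width depends on the build */

    /* error = sum((coeff - dqcoeff)^2), *ssz = sum(coeff^2) */
    static int64_t block_error_sketch(const tran_low_t *coeff,
                                      const tran_low_t *dqcoeff,
                                      intptr_t block_size, int64_t *ssz) {
      int64_t error = 0, sqcoeff = 0;
      intptr_t i;
      for (i = 0; i < block_size; ++i) {
        const int64_t diff = (int64_t)coeff[i] - dqcoeff[i];
        error += diff * diff;
        sqcoeff += (int64_t)coeff[i] * coeff[i];
      }
      *ssz = sqcoeff;
      return error;
    }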
- return vp9_block_error_c(coeff, dqcoeff, block_size, ssz); -} - static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd) { if (bd == 8) { - return vp9_highbd_block_error_8bit(coeff, dqcoeff, block_size, ssz); + return vp9_block_error(coeff, dqcoeff, block_size, ssz); } else { return vp9_highbd_block_error(coeff, dqcoeff, block_size, ssz, bd); } @@ -321,7 +325,7 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, return error; } -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size) { int i; int64_t error = 0; @@ -358,11 +362,11 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = x->token_costs[tx_size][type][is_inter_block(mi)]; uint8_t token_cache[32 * 32]; - int c, cost; + int cost; #if CONFIG_VP9_HIGHBITDEPTH - const int *cat6_high_cost = vp9_get_high_cost_table(xd->bd); + const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd); #else - const int *cat6_high_cost = vp9_get_high_cost_table(8); + const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8); #endif // Check for consistency of tx_size with mode info @@ -373,10 +377,10 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, if (eob == 0) { // single eob token cost = token_costs[0][0][pt][EOB_TOKEN]; - c = 0; } else { if (use_fast_coef_costing) { int band_left = *band_count++; + int c; // dc token int v = qcoeff[0]; @@ -407,6 +411,7 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, } else { // !use_fast_coef_costing int band_left = *band_count++; + int c; // dc token int v = qcoeff[0]; @@ -510,7 +515,8 @@ static int64_t sum_squares_visible(const MACROBLOCKD *xd, pd->subsampling_y, blk_row); if (tx_bsize == BLOCK_4X4 || (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) { - sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_bsize); + assert(tx_4x4_w == tx_4x4_h); + sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2); } else { int r, c; int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h); @@ -520,7 +526,8 @@ static int64_t sum_squares_visible(const MACROBLOCKD *xd, for (r = 0; r < max_r; ++r) { // Skip visiting the sub blocks that are wholly within the UMV. 
for (c = 0; c < max_c; ++c) { - sse += (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, BLOCK_4X4); + sse += (int64_t)vpx_sum_squares_2d_i16( + diff + r * diff_stride * 4 + c * 4, diff_stride, 4); } } } @@ -592,31 +599,31 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - recon = CONVERT_TO_BYTEPTR(recon); - vpx_highbd_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, - bs, bs, xd->bd); + vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16, + 32, NULL, 0, 0, 0, 0, bs, bs, xd->bd); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_idct4x4_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, *eob, xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, *eob, xd->bd); break; case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, *eob, xd->bd); break; default: assert(0 && "Invalid transform size"); } } + recon = CONVERT_TO_BYTEPTR(recon16); } else { #endif // CONFIG_VP9_HIGHBITDEPTH - vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs); + vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs); switch (tx_size) { case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, *eob); break; case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, *eob); break; @@ -625,7 +632,7 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, // this is like vp9_short_idct4x4 but has a special case around // eob<=1, which is significant (not just an optimization) for // the lossless case. - x->itxm_add(dqcoeff, recon, 32, *eob); + x->inv_txfm_add(dqcoeff, recon, 32, *eob); break; default: assert(0 && "Invalid transform size"); break; } @@ -723,7 +730,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, } } else { // SKIP_TXFM_AC_DC - // skip forward transform + // skip forward transform. Because this is handled here, the quantization + // does not need to do it. 
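In the sum_squares_visible hunk above, the corrected calls pass an explicit pixel dimension (tx_4x4_w << 2, or 4 inside the loop) and offset diff to each 4x4 sub-block, whereas the removed inner-loop call read from the unoffset block origin on every iteration. A self-contained reference of the per-sub-block accumulation, assuming that reading (sum_squares_4x4 below is a plain stand-in for vpx_sum_squares_2d_i16, not the dsp routine itself):

    #include <stdint.h>

    /* Reference 4x4 sum of squared residuals (stand-in for the dsp routine). */
    static int64_t sum_squares_4x4(const int16_t *diff, int stride) {
      int64_t ss = 0;
      int r, c;
      for (r = 0; r < 4; ++r)
        for (c = 0; c < 4; ++c)
          ss += (int64_t)diff[r * stride + c] * diff[r * stride + c];
      return ss;
    }

    /* Accumulate SSE over only the visible 4x4 sub-blocks, stepping into
     * each sub-block as the corrected loop above does. */
    static int64_t visible_sse(const int16_t *diff, int stride,
                               int max_r, int max_c) {
      int64_t sse = 0;
      int r, c;
      for (r = 0; r < max_r; ++r)
        for (c = 0; c < max_c; ++c)
          sse += sum_squares_4x4(diff + r * stride * 4 + c * 4, stride);
      return sse;
    }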
x->plane[plane].eobs[block] = 0; sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4; dist = sse; @@ -751,9 +759,11 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, // TODO(jingning): temporarily enabled only for luma component rd = VPXMIN(rd1, rd2); - if (plane == 0) + if (plane == 0) { x->zcoeff_blk[tx_size][block] = !x->plane[plane].eobs[block] || (rd1 > rd2 && !xd->lossless); + x->sum_y_eobs[tx_size] += x->plane[plane].eobs[block]; + } args->this_rate += rate; args->this_dist += dist; @@ -995,6 +1005,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int block = (row + idy) * 2 + (col + idx); const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride]; uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); @@ -1016,7 +1027,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0); if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) goto next_highbd; - vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, + vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst16, dst_stride, p->eobs[block], xd->bd); } else { int64_t unused; @@ -1039,7 +1050,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) goto next_highbd; vp9_highbd_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), - dst, dst_stride, p->eobs[block], xd->bd); + dst16, dst_stride, p->eobs[block], xd->bd); } } } @@ -1129,16 +1140,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 
1 : 0; -#if CONFIG_VP9_HIGHBITDEPTH - distortion += - vp9_highbd_block_error_8bit( - coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >> - 2; -#else distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >> 2; -#endif if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) goto next; vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), dst, @@ -1526,7 +1530,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vp9_highbd_build_inter_predictor( - pre, y_stride, dst, pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, + CONVERT_TO_SHORTPTR(pre), y_stride, CONVERT_TO_SHORTPTR(dst), + pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, &xd->block_refs[ref]->sf, width, height, ref, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE + 4 * (i % 2), mi_row * MI_SIZE + 4 * (i / 2), xd->bd); @@ -1572,8 +1577,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, k += (idy * 2 + idx); coeff_ctx = combine_entropy_contexts(ta[k & 1], tl[k >> 1]); coeff = BLOCK_OFFSET(p->coeff, k); - x->fwd_txm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), - coeff, 8); + x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), + coeff, 8); vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan); #if CONFIG_VP9_HIGHBITDEPTH thisdistortion += vp9_highbd_block_error_dispatch( @@ -1781,9 +1786,9 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); vp9_highbd_build_inter_predictor( - ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw, - &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, kernel, MV_PRECISION_Q3, - mi_col * MI_SIZE, mi_row * MI_SIZE, xd->bd); + CONVERT_TO_SHORTPTR(ref_yv12[!id].buf), ref_yv12[!id].stride, + second_pred_alloc_16, pw, &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, + kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd->bd); } else { second_pred = (uint8_t *)second_pred_alloc_16; vp9_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride, @@ -1998,7 +2003,8 @@ static int64_t rd_pick_best_sub8x8_mode( vp9_set_mv_search_range(&x->mv_limits, &bsi->ref_mv[0]->as_mv); bestsme = vp9_full_pixel_search( - cpi, x, bsize, &mvp_full, step_param, sadpb, + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, + sadpb, sf->mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL, &bsi->ref_mv[0]->as_mv, new_mv, INT_MAX, 1); @@ -2403,9 +2409,9 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, mvp_full.col >>= 3; mvp_full.row >>= 3; - bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, - cond_cost_list(cpi, cost_list), &ref_mv, - &tmp_mv->as_mv, INT_MAX, 1); + bestsme = vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb, + cond_cost_list(cpi, cost_list), &ref_mv, &tmp_mv->as_mv, INT_MAX, 1); x->mv_limits = tmp_mv_limits; @@ -2870,57 +2876,82 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, // This function is designed to apply a bias or adjustment to an rd value based // on the relative variance of the source and reconstruction. 
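Reading the rewritten adjustment in the hunk that follows: the bias only applies when the smaller of the source and reconstruction variances is at or below LOW_VAR_THRESH, and it then inflates the RD cost by var_factor percent, where var_factor = min(adj_max, VAR_MULT * |src - rec| / max(1, src)) and adj_max is taken from max_var_adjust[] for the configured content type. A minimal standalone sketch of that arithmetic (names are local to the example; the extra film-content doubling is omitted):

    #include <stdint.h>

    static int64_t adjust_rd_sketch(int64_t rd, unsigned int src_var,
                                    unsigned int rec_var, unsigned int adj_max) {
      const unsigned int low_var_thresh = 5; /* LOW_VAR_THRESH in the hunk below */
      const unsigned int var_mult = 100;     /* VAR_MULT in the hunk below */
      const unsigned int min_var = src_var < rec_var ? src_var : rec_var;
      unsigned int diff, var_factor;
      if (min_var > low_var_thresh) return rd; /* bias not applied */
      diff = src_var > rec_var ? src_var - rec_var : rec_var - src_var;
      var_factor = (unsigned int)(((int64_t)var_mult * diff) /
                                  (src_var > 0 ? src_var : 1));
      if (var_factor > adj_max) var_factor = adj_max; /* cap per content type */
      return rd + (rd * var_factor) / 100; /* inflate rd by var_factor percent */
    }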
-#define LOW_VAR_THRESH 16 -#define VLOW_ADJ_MAX 25 -#define VHIGH_ADJ_MAX 8 +#define VERY_LOW_VAR_THRESH 2 +#define LOW_VAR_THRESH 5 +#define VAR_MULT 100 +static unsigned int max_var_adjust[VP9E_CONTENT_INVALID] = { 16, 16, 100 }; + static void rd_variance_adjustment(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *this_rd, MV_REFERENCE_FRAME ref_frame, unsigned int source_variance) { MACROBLOCKD *const xd = &x->e_mbd; - unsigned int recon_variance; + unsigned int rec_variance; + unsigned int src_variance; + unsigned int src_rec_min; unsigned int absvar_diff = 0; - int64_t var_error = 0; - int64_t var_factor = 0; + unsigned int var_factor = 0; + unsigned int adj_max; + vp9e_tune_content content_type = cpi->oxcf.content; if (*this_rd == INT64_MAX) return; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - recon_variance = vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, + if (source_variance > 0) { + rec_variance = vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize, xd->bd); + src_variance = source_variance; + } else { + rec_variance = + vp9_high_get_sby_variance(cpi, &xd->plane[0].dst, bsize, xd->bd); + src_variance = + vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, xd->bd); + } } else { - recon_variance = - vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); + if (source_variance > 0) { + rec_variance = + vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); + src_variance = source_variance; + } else { + rec_variance = vp9_get_sby_variance(cpi, &xd->plane[0].dst, bsize); + src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); + } } #else - recon_variance = vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); + if (source_variance > 0) { + rec_variance = vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); + src_variance = source_variance; + } else { + rec_variance = vp9_get_sby_variance(cpi, &xd->plane[0].dst, bsize); + src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); + } #endif // CONFIG_VP9_HIGHBITDEPTH - if ((source_variance + recon_variance) > LOW_VAR_THRESH) { - absvar_diff = (source_variance > recon_variance) - ? (source_variance - recon_variance) - : (recon_variance - source_variance); + // Lower of source (raw per pixel value) and recon variance. Note that + // if the source per pixel is 0 then the recon value here will not be per + // pixel (see above) so will likely be much larger. + src_rec_min = VPXMIN(source_variance, rec_variance); - var_error = ((int64_t)200 * source_variance * recon_variance) / - (((int64_t)source_variance * source_variance) + - ((int64_t)recon_variance * recon_variance)); - var_error = 100 - var_error; - } + if (src_rec_min > LOW_VAR_THRESH) return; + + absvar_diff = (src_variance > rec_variance) ? (src_variance - rec_variance) + : (rec_variance - src_variance); + + adj_max = max_var_adjust[content_type]; + + var_factor = + (unsigned int)((int64_t)VAR_MULT * absvar_diff) / VPXMAX(1, src_variance); + var_factor = VPXMIN(adj_max, var_factor); - // Source variance above a threshold and ref frame is intra. - // This case is targeted mainly at discouraging intra modes that give rise - // to a predictor with a low spatial complexity compared to the source. 
- if ((source_variance > LOW_VAR_THRESH) && (ref_frame == INTRA_FRAME) && - (source_variance > recon_variance)) { - var_factor = VPXMIN(absvar_diff, VPXMIN(VLOW_ADJ_MAX, var_error)); - // A second possible case of interest is where the source variance - // is very low and we wish to discourage false texture or motion trails. - } else if ((source_variance < (LOW_VAR_THRESH >> 1)) && - (recon_variance > source_variance)) { - var_factor = VPXMIN(absvar_diff, VPXMIN(VHIGH_ADJ_MAX, var_error)); - } *this_rd += (*this_rd * var_factor) / 100; + + if (content_type == VP9E_CONTENT_FILM) { + if (src_rec_min <= VERY_LOW_VAR_THRESH) { + if (ref_frame == INTRA_FRAME) *this_rd *= 2; + if (bsize > 6) *this_rd *= 2; + } + } } // Do we have an internal image edge (e.g. formatting bars). @@ -3032,8 +3063,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t dist_uv[TX_SIZES]; int skip_uv[TX_SIZES]; PREDICTION_MODE mode_uv[TX_SIZES]; - const int intra_cost_penalty = vp9_get_intra_cost_penalty( - cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); + const int intra_cost_penalty = + vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); int best_skip2 = 0; uint8_t ref_frame_skip_mask[2] = { 0 }; uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 }; @@ -3041,7 +3072,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, const int *const rd_threshes = rd_opt->threshes[segment_id][bsize]; const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; int64_t mode_threshold[MAX_MODES]; - int *mode_map = tile_data->mode_map[bsize]; + int *tile_mode_map = tile_data->mode_map[bsize]; + int mode_map[MAX_MODES]; // Maintain mode_map information locally to avoid + // lock mechanism involved with reads from + // tile_mode_map const int mode_search_skip_flags = sf->mode_search_skip_flags; int64_t mask_filter = 0; int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS]; @@ -3153,23 +3187,28 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0; + for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i) mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5; midx = sf->schedule_mode_search ? mode_skip_start : 0; + while (midx > 4) { uint8_t end_pos = 0; for (i = 5; i < midx; ++i) { - if (mode_threshold[mode_map[i - 1]] > mode_threshold[mode_map[i]]) { - uint8_t tmp = mode_map[i]; - mode_map[i] = mode_map[i - 1]; - mode_map[i - 1] = tmp; + if (mode_threshold[tile_mode_map[i - 1]] > + mode_threshold[tile_mode_map[i]]) { + uint8_t tmp = tile_mode_map[i]; + tile_mode_map[i] = tile_mode_map[i - 1]; + tile_mode_map[i - 1] = tmp; end_pos = i; } } midx = end_pos; } + memcpy(mode_map, tile_mode_map, sizeof(mode_map)); + for (midx = 0; midx < MAX_MODES; ++midx) { int mode_index = mode_map[midx]; int mode_excluded = 0; @@ -3187,6 +3226,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, ref_frame = vp9_mode_order[mode_index].ref_frame[0]; second_ref_frame = vp9_mode_order[mode_index].ref_frame[1]; + vp9_zero(x->sum_y_eobs); + // Look at the reference frame of the best mode so far and set the // skip mask to look at a subset of the remaining modes. if (midx == mode_skip_start && best_mode_index >= 0) { @@ -3218,6 +3259,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (best_rd < mode_threshold[mode_index]) continue; + // This is only used in motion vector unit test. 
+ if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue; + if (sf->motion_field_mode_search) { const int mi_width = VPXMIN(num_8x8_blocks_wide_lookup[bsize], tile_info->mi_col_end - mi_col); @@ -3466,6 +3510,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, max_plane); memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mi->tx_size], sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk); + ctx->sum_y_eobs = x->sum_y_eobs[mi->tx_size]; // TODO(debargha): enhance this test with a better distortion prediction // based on qp, activity mask and history @@ -3571,6 +3616,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } if (best_mode_index < 0 || best_rd >= best_rd_so_far) { + // If adaptive interp filter is enabled, then the current leaf node of 8x8 + // data is needed for sub8x8. Hence preserve the context. + if (cpi->row_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; @@ -3689,6 +3737,8 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, TileDataEnc *tile_data, mi->mv[0].as_int = 0; x->skip = 1; + ctx->sum_y_eobs = 0; + if (cm->interp_filter != BILINEAR) { best_filter = EIGHTTAP; if (cm->interp_filter == SWITCHABLE && @@ -3777,8 +3827,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t dist_uv; int skip_uv; PREDICTION_MODE mode_uv = DC_PRED; - const int intra_cost_penalty = vp9_get_intra_cost_penalty( - cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); + const int intra_cost_penalty = + vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); int_mv seg_mvs[4][MAX_REF_FRAMES]; b_mode_info best_bmodes[4]; int best_skip2 = 0; @@ -3787,6 +3837,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS]; int internal_active_edge = vp9_active_edge_sb(cpi, mi_row, mi_col) && vp9_internal_image_edge(cpi); + const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; memset(x->zcoeff_blk[TX_4X4], 0, 4); @@ -3838,6 +3889,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, ref_frame = vp9_ref_order[ref_index].ref_frame[0]; second_ref_frame = vp9_ref_order[ref_index].ref_frame[1]; + vp9_zero(x->sum_y_eobs); + #if CONFIG_BETTER_HW_COMPATIBILITY // forbid 8X4 and 4X8 partitions if any reference frame is scaled. if (bsize == BLOCK_8X4 || bsize == BLOCK_4X8) { @@ -3878,9 +3931,12 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, if (!internal_active_edge && rd_less_than_thresh(best_rd, rd_opt->threshes[segment_id][bsize][ref_index], - tile_data->thresh_freq_fact[bsize][ref_index])) + &rd_thresh_freq_fact[ref_index])) continue; + // This is only used in motion vector unit test. 
+ if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue; + comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; @@ -4051,6 +4107,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, for (i = 0; i < 4; i++) { tmp_best_bmodes[i] = xd->mi[0]->bmi[i]; x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i]; + x->sum_y_eobs[TX_4X4] += x->plane[0].eobs[i]; } pred_exists = 1; if (switchable_filter_index == 0 && sf->use_rd_breakout && @@ -4215,6 +4272,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, max_plane); memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4], sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk); + ctx->sum_y_eobs = x->sum_y_eobs[TX_4X4]; for (i = 0; i < 4; i++) best_bmodes[i] = xd->mi[0]->bmi[i]; diff --git a/libs/libvpx/vp9/encoder/vp9_skin_detection.c b/libs/libvpx/vp9/encoder/vp9_skin_detection.c index 3f3d48fb9f..cc6c967767 100644 --- a/libs/libvpx/vp9/encoder/vp9_skin_detection.c +++ b/libs/libvpx/vp9/encoder/vp9_skin_detection.c @@ -15,75 +15,6 @@ #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_skin_detection.h" -#define MODEL_MODE 1 - -// Fixed-point skin color model parameters. -static const int skin_mean[5][2] = { { 7463, 9614 }, - { 6400, 10240 }, - { 7040, 10240 }, - { 8320, 9280 }, - { 6800, 9614 } }; -static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16 -static const int skin_threshold[6] = { 1570636, 1400000, 800000, - 800000, 800000, 800000 }; // q18 - -// Thresholds on luminance. -static const int y_low = 40; -static const int y_high = 220; - -// Evaluates the Mahalanobis distance measure for the input CbCr values. -static int evaluate_skin_color_difference(int cb, int cr, int idx) { - const int cb_q6 = cb << 6; - const int cr_q6 = cr << 6; - const int cb_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]); - const int cbcr_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]); - const int cr_diff_q12 = - (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]); - const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10; - const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10; - const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10; - const int skin_diff = - skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 + - skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2; - return skin_diff; -} - -int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr, - int motion) { - if (y < y_low || y > y_high) { - return 0; - } else { - if (MODEL_MODE == 0) { - return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]); - } else { - int i = 0; - // Exit on grey. - if (cb == 128 && cr == 128) return 0; - // Exit on very strong cb. - if (cb > 150 && cr < 110) return 0; - for (; i < 5; i++) { - int skin_color_diff = evaluate_skin_color_difference(cb, cr, i); - if (skin_color_diff < skin_threshold[i + 1]) { - if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) - return 0; - else if (motion == 0 && - skin_color_diff > (skin_threshold[i + 1] >> 1)) - return 0; - else - return 1; - } - // Exit if difference is much large than the threshold. 
- if (skin_color_diff > (skin_threshold[i + 1] << 3)) { - return 0; - } - } - return 0; - } - } -} - int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, int stride, int strideuv, int bsize, int consec_zeromv, int curr_motion_magn) { @@ -100,31 +31,113 @@ int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, const uint8_t ysource = y[y_height_shift * stride + y_width_shift]; const uint8_t usource = u[uv_height_shift * strideuv + uv_width_shift]; const uint8_t vsource = v[uv_height_shift * strideuv + uv_width_shift]; + if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0; - return vp9_skin_pixel(ysource, usource, vsource, motion); + return vpx_skin_pixel(ysource, usource, vsource, motion); } } -#ifdef OUTPUT_YUV_SKINMAP -// For viewing skin map on input source. -void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) { - int i, j, mi_row, mi_col, num_bl; +void vp9_compute_skin_sb(VP9_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + int i, j, num_bl; VP9_COMMON *const cm = &cpi->common; - uint8_t *y; const uint8_t *src_y = cpi->Source->y_buffer; const uint8_t *src_u = cpi->Source->u_buffer; const uint8_t *src_v = cpi->Source->v_buffer; const int src_ystride = cpi->Source->y_stride; const int src_uvstride = cpi->Source->uv_stride; - int y_bsize = 16; // Use 8x8 or 16x16. - int uv_bsize = y_bsize >> 1; - int ypos = y_bsize >> 1; - int uvpos = uv_bsize >> 1; - int shy = (y_bsize == 8) ? 3 : 4; - int shuv = shy - 1; - int fac = y_bsize / 8; - // Use center pixel or average of center 2x2 pixels. - int mode_filter = 0; + const int y_bsize = 4 << b_width_log2_lookup[bsize]; + const int uv_bsize = y_bsize >> 1; + const int shy = (y_bsize == 8) ? 3 : 4; + const int shuv = shy - 1; + const int fac = y_bsize / 8; + const int y_shift = src_ystride * (mi_row << 3) + (mi_col << 3); + const int uv_shift = src_uvstride * (mi_row << 2) + (mi_col << 2); + const int mi_row_limit = VPXMIN(mi_row + 8, cm->mi_rows - 2); + const int mi_col_limit = VPXMIN(mi_col + 8, cm->mi_cols - 2); + src_y += y_shift; + src_u += uv_shift; + src_v += uv_shift; + + for (i = mi_row; i < mi_row_limit; i += fac) { + num_bl = 0; + for (j = mi_col; j < mi_col_limit; j += fac) { + int consec_zeromv = 0; + int bl_index = i * cm->mi_cols + j; + int bl_index1 = bl_index + 1; + int bl_index2 = bl_index + cm->mi_cols; + int bl_index3 = bl_index2 + 1; + // Don't detect skin on the boundary. + if (i == 0 || j == 0) continue; + if (bsize == BLOCK_8X8) + consec_zeromv = cpi->consec_zero_mv[bl_index]; + else + consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], + VPXMIN(cpi->consec_zero_mv[bl_index1], + VPXMIN(cpi->consec_zero_mv[bl_index2], + cpi->consec_zero_mv[bl_index3]))); + cpi->skin_map[bl_index] = + vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride, + bsize, consec_zeromv, 0); + num_bl++; + src_y += y_bsize; + src_u += uv_bsize; + src_v += uv_bsize; + } + src_y += (src_ystride << shy) - (num_bl << shy); + src_u += (src_uvstride << shuv) - (num_bl << shuv); + src_v += (src_uvstride << shuv) - (num_bl << shuv); + } + + // Remove isolated skin blocks (none of its neighbors are skin) and isolated + // non-skin blocks (all of its neighbors are skin). + // Skip 4 corner blocks which have only 3 neighbors to remove isolated skin + // blocks. Skip superblock borders to remove isolated non-skin blocks. 
+ for (i = mi_row; i < mi_row_limit; i += fac) { + for (j = mi_col; j < mi_col_limit; j += fac) { + int bl_index = i * cm->mi_cols + j; + int num_neighbor = 0; + int mi, mj; + int non_skin_threshold = 8; + // Skip 4 corners. + if ((i == mi_row && (j == mi_col || j == mi_col_limit - fac)) || + (i == mi_row_limit - fac && (j == mi_col || j == mi_col_limit - fac))) + continue; + // There are only 5 neighbors for non-skin blocks on the border. + if (i == mi_row || i == mi_row_limit - fac || j == mi_col || + j == mi_col_limit - fac) + non_skin_threshold = 5; + + for (mi = -fac; mi <= fac; mi += fac) { + for (mj = -fac; mj <= fac; mj += fac) { + if (i + mi >= mi_row && i + mi < mi_row_limit && j + mj >= mi_col && + j + mj < mi_col_limit) { + int bl_neighbor_index = (i + mi) * cm->mi_cols + j + mj; + if (cpi->skin_map[bl_neighbor_index]) num_neighbor++; + } + } + } + + if (cpi->skin_map[bl_index] && num_neighbor < 2) + cpi->skin_map[bl_index] = 0; + if (!cpi->skin_map[bl_index] && num_neighbor == non_skin_threshold) + cpi->skin_map[bl_index] = 1; + } + } +} + +#ifdef OUTPUT_YUV_SKINMAP +// For viewing skin map on input source. +void vp9_output_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) { + int i, j, mi_row, mi_col, num_bl; + VP9_COMMON *const cm = &cpi->common; + uint8_t *y; + const uint8_t *src_y = cpi->Source->y_buffer; + const int src_ystride = cpi->Source->y_stride; + const int y_bsize = 16; // Use 8x8 or 16x16. + const int shy = (y_bsize == 8) ? 3 : 4; + const int fac = y_bsize / 8; + YV12_BUFFER_CONFIG skinmap; memset(&skinmap, 0, sizeof(YV12_BUFFER_CONFIG)); if (vpx_alloc_frame_buffer(&skinmap, cm->width, cm->height, cm->subsampling_x, @@ -141,65 +154,21 @@ void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) { for (mi_row = 0; mi_row < cm->mi_rows - 1; mi_row += fac) { num_bl = 0; for (mi_col = 0; mi_col < cm->mi_cols - 1; mi_col += fac) { - int is_skin = 0; - if (mode_filter == 1) { - // Use 2x2 average at center. 
- uint8_t ysource = src_y[ypos * src_ystride + ypos]; - uint8_t usource = src_u[uvpos * src_uvstride + uvpos]; - uint8_t vsource = src_v[uvpos * src_uvstride + uvpos]; - uint8_t ysource2 = src_y[(ypos + 1) * src_ystride + ypos]; - uint8_t usource2 = src_u[(uvpos + 1) * src_uvstride + uvpos]; - uint8_t vsource2 = src_v[(uvpos + 1) * src_uvstride + uvpos]; - uint8_t ysource3 = src_y[ypos * src_ystride + (ypos + 1)]; - uint8_t usource3 = src_u[uvpos * src_uvstride + (uvpos + 1)]; - uint8_t vsource3 = src_v[uvpos * src_uvstride + (uvpos + 1)]; - uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)]; - uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos + 1)]; - uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos + 1)]; - ysource = (ysource + ysource2 + ysource3 + ysource4) >> 2; - usource = (usource + usource2 + usource3 + usource4) >> 2; - vsource = (vsource + vsource2 + vsource3 + vsource4) >> 2; - is_skin = vp9_skin_pixel(ysource, usource, vsource, 1); - } else { - int block_size = BLOCK_8X8; - int consec_zeromv = 0; - int bl_index = mi_row * cm->mi_cols + mi_col; - int bl_index1 = bl_index + 1; - int bl_index2 = bl_index + cm->mi_cols; - int bl_index3 = bl_index2 + 1; - if (y_bsize == 8) - consec_zeromv = cpi->consec_zero_mv[bl_index]; - else - consec_zeromv = - VPXMIN(cpi->consec_zero_mv[bl_index], - VPXMIN(cpi->consec_zero_mv[bl_index1], - VPXMIN(cpi->consec_zero_mv[bl_index2], - cpi->consec_zero_mv[bl_index3]))); - if (y_bsize == 16) block_size = BLOCK_16X16; - is_skin = - vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, - src_uvstride, block_size, consec_zeromv, 0); - } + const int block_index = mi_row * cm->mi_cols + mi_col; + const int is_skin = cpi->skin_map[block_index]; for (i = 0; i < y_bsize; i++) { for (j = 0; j < y_bsize; j++) { - if (is_skin) - y[i * src_ystride + j] = 255; - else - y[i * src_ystride + j] = src_y[i * src_ystride + j]; + y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j]; } } num_bl++; y += y_bsize; src_y += y_bsize; - src_u += uv_bsize; - src_v += uv_bsize; } y += (src_ystride << shy) - (num_bl << shy); src_y += (src_ystride << shy) - (num_bl << shy); - src_u += (src_uvstride << shuv) - (num_bl << shuv); - src_v += (src_uvstride << shuv) - (num_bl << shuv); } - vp9_write_yuv_frame_420(&skinmap, yuv_skinmap_file); + vpx_write_yuv_frame(yuv_skinmap_file, &skinmap); vpx_free_frame_buffer(&skinmap); } #endif diff --git a/libs/libvpx/vp9/encoder/vp9_skin_detection.h b/libs/libvpx/vp9/encoder/vp9_skin_detection.h index c77382dbd7..8880bff466 100644 --- a/libs/libvpx/vp9/encoder/vp9_skin_detection.h +++ b/libs/libvpx/vp9/encoder/vp9_skin_detection.h @@ -12,6 +12,8 @@ #define VP9_ENCODER_VP9_SKIN_MAP_H_ #include "vp9/common/vp9_blockd.h" +#include "vpx_dsp/skin_detection.h" +#include "vpx_util/vpx_write_yuv_frame.h" #ifdef __cplusplus extern "C" { @@ -19,19 +21,16 @@ extern "C" { struct VP9_COMP; -// #define OUTPUT_YUV_SKINMAP - -int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr, - int motion); - int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, int stride, int strideuv, int bsize, int consec_zeromv, int curr_motion_magn); +void vp9_compute_skin_sb(struct VP9_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col); + #ifdef OUTPUT_YUV_SKINMAP // For viewing skin map on input source. 
-void vp9_compute_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file); -extern void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f); +void vp9_output_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file); #endif #ifdef __cplusplus diff --git a/libs/libvpx/vp9/encoder/vp9_speed_features.c b/libs/libvpx/vp9/encoder/vp9_speed_features.c index c1cf48fa7b..a05db60c65 100644 --- a/libs/libvpx/vp9/encoder/vp9_speed_features.c +++ b/libs/libvpx/vp9/encoder/vp9_speed_features.c @@ -20,19 +20,14 @@ static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = { { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } }; -#define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method +// Define 3 mesh density levels to control the number of searches. +#define MESH_DENSITY_LEVELS 3 static MESH_PATTERN - good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { - { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, + good_quality_mesh_patterns[MESH_DENSITY_LEVELS][MAX_MESH_STEP] = { { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } }, { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, - { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, - { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, }; -static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] = { - 50, 25, 15, 5, 1, 1 -}; // Intra only frames, golden frames (except alt ref overlays) and // alt ref frames tend to be coded at a higher than ambient quality @@ -67,14 +62,26 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, int speed) { VP9_COMMON *const cm = &cpi->common; + // speed 0 features + sf->partition_search_breakout_thr.dist = (1 << 20); + sf->partition_search_breakout_thr.rate = 80; + + // Currently, the machine-learning based partition search early termination + // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0. + if (VPXMIN(cm->width, cm->height) >= 480) { + sf->ml_partition_search_early_termination = 1; + } + if (speed >= 1) { + sf->ml_partition_search_early_termination = 0; + if (VPXMIN(cm->width, cm->height) >= 720) { sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; - sf->partition_search_breakout_dist_thr = (1 << 23); + sf->partition_search_breakout_thr.dist = (1 << 23); } else { sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; - sf->partition_search_breakout_dist_thr = (1 << 21); + sf->partition_search_breakout_thr.dist = (1 << 21); } } @@ -83,12 +90,12 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; sf->adaptive_pred_interp_filter = 0; - sf->partition_search_breakout_dist_thr = (1 << 24); - sf->partition_search_breakout_rate_thr = 120; + sf->partition_search_breakout_thr.dist = (1 << 24); + sf->partition_search_breakout_thr.rate = 120; } else { sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; - sf->partition_search_breakout_dist_thr = (1 << 22); - sf->partition_search_breakout_rate_thr = 100; + sf->partition_search_breakout_thr.dist = (1 << 22); + sf->partition_search_breakout_thr.rate = 100; } sf->rd_auto_partition_min_limit = set_partition_min_limit(cm); @@ -108,14 +115,14 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, if (VPXMIN(cm->width, cm->height) >= 720) { sf->disable_split_mask = DISABLE_ALL_SPLIT; sf->schedule_mode_search = cm->base_qindex < 220 ? 
1 : 0; - sf->partition_search_breakout_dist_thr = (1 << 25); - sf->partition_search_breakout_rate_thr = 200; + sf->partition_search_breakout_thr.dist = (1 << 25); + sf->partition_search_breakout_thr.rate = 200; } else { sf->max_intra_bsize = BLOCK_32X32; sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT; sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0; - sf->partition_search_breakout_dist_thr = (1 << 23); - sf->partition_search_breakout_rate_thr = 120; + sf->partition_search_breakout_thr.dist = (1 << 23); + sf->partition_search_breakout_thr.rate = 120; } } @@ -129,33 +136,54 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, } if (speed >= 4) { + sf->partition_search_breakout_thr.rate = 300; if (VPXMIN(cm->width, cm->height) >= 720) { - sf->partition_search_breakout_dist_thr = (1 << 26); + sf->partition_search_breakout_thr.dist = (1 << 26); } else { - sf->partition_search_breakout_dist_thr = (1 << 24); + sf->partition_search_breakout_thr.dist = (1 << 24); } sf->disable_split_mask = DISABLE_ALL_SPLIT; } + + if (speed >= 5) { + sf->partition_search_breakout_thr.rate = 500; + } } static double tx_dom_thresholds[6] = { 99.0, 14.0, 12.0, 8.0, 4.0, 0.0 }; static double qopt_thresholds[6] = { 99.0, 12.0, 10.0, 4.0, 2.0, 0.0 }; -static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, - SPEED_FEATURES *sf, int speed) { +static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, + VP9_COMMON *cm, + SPEED_FEATURES *sf, + int speed) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; const int boosted = frame_is_boosted(cpi); + int i; - sf->partition_search_breakout_dist_thr = (1 << 20); - sf->partition_search_breakout_rate_thr = 80; sf->tx_size_search_breakout = 1; sf->adaptive_rd_thresh = 1; + sf->adaptive_rd_thresh_row_mt = 0; sf->allow_skip_recode = 1; sf->less_rectangular_check = 1; sf->use_square_partition_only = !frame_is_boosted(cpi); sf->use_square_only_threshold = BLOCK_16X16; + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + sf->exhaustive_searches_thresh = (1 << 22); + for (i = 0; i < MAX_MESH_STEP; ++i) { + int mesh_density_level = 0; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_density_level][i].interval; + } + } else { + sf->exhaustive_searches_thresh = INT_MAX; + } + if (speed >= 1) { - if (cpi->oxcf.pass == 2) { + if (oxcf->pass == 2) { TWO_PASS *const twopass = &cpi->twopass; if ((twopass->fr_content_type == FC_GRAPHICS_ANIMATION) || vp9_internal_image_edge(cpi)) { @@ -182,6 +210,7 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->mv.subpel_iters_per_step = 1; sf->mode_skip_start = 10; sf->adaptive_pred_interp_filter = 1; + sf->allow_acl = 0; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; @@ -190,27 +219,44 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 30; + + sf->exhaustive_searches_thresh = + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 23) + : INT_MAX; } if (speed >= 2) { - sf->recode_loop = ALLOW_RECODE_KFARFGF; + if (oxcf->vbr_corpus_complexity) + sf->recode_loop = ALLOW_RECODE_FIRST; + else + sf->recode_loop = ALLOW_RECODE_KFARFGF; + sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL; // Reference masking is not supported in dynamic scaling mode. 
- sf->reference_masking = cpi->oxcf.resize_mode != RESIZE_DYNAMIC ? 1 : 0; + sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC ? 1 : 0; sf->mode_search_skip_flags = - (cm->frame_type == KEY_FRAME) ? 0 : FLAG_SKIP_INTRA_DIRMISMATCH | - FLAG_SKIP_INTRA_BESTINTER | - FLAG_SKIP_COMP_BESTINTRA | - FLAG_SKIP_INTRA_LOWVAR; + (cm->frame_type == KEY_FRAME) + ? 0 + : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; sf->disable_filter_search_var_thresh = 100; sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; - sf->allow_partition_search_skip = 1; sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 45; + + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + for (i = 0; i < MAX_MESH_STEP; ++i) { + int mesh_density_level = 1; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_density_level][i].interval; + } + } } if (speed >= 3) { @@ -229,6 +275,17 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; sf->adaptive_interp_filter_search = 1; + sf->allow_partition_search_skip = 1; + + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + for (i = 0; i < MAX_MESH_STEP; ++i) { + int mesh_density_level = 2; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_density_level][i].interval; + } + } } if (speed >= 4) { @@ -244,7 +301,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->use_fast_coef_updates = ONE_LOOP_REDUCED; sf->use_fast_coef_costing = 1; sf->motion_field_mode_search = !boosted; - sf->partition_search_breakout_rate_thr = 300; } if (speed >= 5) { @@ -256,7 +312,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->intra_y_mode_mask[i] = INTRA_DC; sf->intra_uv_mode_mask[i] = INTRA_DC; } - sf->partition_search_breakout_rate_thr = 500; sf->mv.reduce_first_step_size = 1; sf->simple_model_rd_from_var = 1; } @@ -286,10 +341,11 @@ static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi, } if (speed >= 5) { + sf->partition_search_breakout_thr.rate = 200; if (VPXMIN(cm->width, cm->height) >= 720) { - sf->partition_search_breakout_dist_thr = (1 << 25); + sf->partition_search_breakout_thr.dist = (1 << 25); } else { - sf->partition_search_breakout_dist_thr = (1 << 23); + sf->partition_search_breakout_thr.dist = (1 << 23); } } @@ -299,16 +355,25 @@ static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi, } } -static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, - vp9e_tune_content content) { +static void set_rt_speed_feature_framesize_independent( + VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, vp9e_tune_content content) { VP9_COMMON *const cm = &cpi->common; const int is_keyframe = cm->frame_type == KEY_FRAME; const int frames_since_key = is_keyframe ? 
0 : cpi->rc.frames_since_key; sf->static_segmentation = 0; sf->adaptive_rd_thresh = 1; + sf->adaptive_rd_thresh_row_mt = 0; sf->use_fast_coef_costing = 1; - sf->allow_exhaustive_searches = 0; sf->exhaustive_searches_thresh = INT_MAX; + sf->allow_acl = 0; + sf->copy_partition_flag = 0; + sf->use_source_sad = 0; + sf->use_simple_block_yrd = 0; + sf->adapt_partition_source_sad = 0; + sf->use_altref_onepass = 0; + sf->use_compound_nonrd_pickmode = 0; + sf->nonrd_keyframe = 0; + sf->svc_use_lowres_part = 0; if (speed >= 1) { sf->allow_txfm_domain_distortion = 1; @@ -333,10 +398,10 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, if (speed >= 2) { sf->mode_search_skip_flags = - (cm->frame_type == KEY_FRAME) ? 0 : FLAG_SKIP_INTRA_DIRMISMATCH | - FLAG_SKIP_INTRA_BESTINTER | - FLAG_SKIP_COMP_BESTINTRA | - FLAG_SKIP_INTRA_LOWVAR; + (cm->frame_type == KEY_FRAME) + ? 0 + : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; sf->adaptive_pred_interp_filter = 2; // Reference masking only enabled for 1 spatial layer, and if none of the @@ -386,6 +451,8 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, if (speed >= 4) { int i; + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0) + sf->use_altref_onepass = 1; sf->last_partitioning_redo_frequency = 4; sf->adaptive_rd_thresh = 5; sf->use_fast_coef_costing = 0; @@ -411,6 +478,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, } if (speed >= 5) { + sf->use_altref_onepass = 0; sf->use_quant_fp = !is_keyframe; sf->auto_min_max_partition_size = is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX; @@ -421,6 +489,10 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1); sf->max_delta_qindex = is_keyframe ? 20 : 15; sf->partition_search_type = REFERENCE_PARTITION; + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 && + cpi->rc.is_src_frame_alt_ref) { + sf->partition_search_type = VAR_BASED_PARTITION; + } sf->use_nonrd_pick_mode = 1; sf->allow_skip_recode = 0; sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO; @@ -430,7 +502,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, sf->adaptive_rd_thresh = 2; // This feature is only enabled when partition search is disabled. sf->reuse_inter_pred_sby = 1; - sf->partition_search_breakout_rate_thr = 200; sf->coeff_prob_appx_step = 4; sf->use_fast_coef_updates = is_keyframe ? TWO_LOOP : ONE_LOOP_REDUCED; sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH; @@ -456,42 +527,103 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, sf->short_circuit_flat_blocks = 1; } if (cpi->oxcf.rc_mode == VPX_CBR && - cpi->oxcf.content != VP9E_CONTENT_SCREEN && !cpi->use_svc) { + cpi->oxcf.content != VP9E_CONTENT_SCREEN) { sf->limit_newmv_early_exit = 1; - sf->bias_golden = 1; + if (!cpi->use_svc) sf->bias_golden = 1; } } if (speed >= 6) { + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0) { + sf->use_altref_onepass = 1; + sf->use_compound_nonrd_pickmode = 1; + } sf->partition_search_type = VAR_BASED_PARTITION; // Turn on this to use non-RD key frame coding mode. 
sf->use_nonrd_pick_mode = 1; sf->mv.search_method = NSTEP; sf->mv.reduce_first_step_size = 1; sf->skip_encode_sb = 0; - if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR && - content != VP9E_CONTENT_SCREEN) { + + if (!cpi->external_resize) sf->use_source_sad = 1; + + if (sf->use_source_sad) { + sf->adapt_partition_source_sad = 1; + sf->adapt_partition_thresh = + (cm->width * cm->height <= 640 * 360) ? 40000 : 60000; + if (cpi->content_state_sb_fd == NULL && + (!cpi->use_svc || + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { + cpi->content_state_sb_fd = (uint8_t *)vpx_calloc( + (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t)); + } + } + if (cpi->oxcf.rc_mode == VPX_CBR && content != VP9E_CONTENT_SCREEN) { // Enable short circuit for low temporal variance. sf->short_circuit_low_temp_var = 1; } - if (cpi->use_svc) sf->base_mv_aggressive = 1; + if (cpi->svc.temporal_layer_id > 0) { + sf->adaptive_rd_thresh = 4; + sf->limit_newmv_early_exit = 0; + sf->base_mv_aggressive = 1; + } } if (speed >= 7) { + sf->adapt_partition_source_sad = 0; sf->adaptive_rd_thresh = 3; sf->mv.search_method = FAST_DIAMOND; sf->mv.fullpel_search_step_param = 10; + // For SVC: use better mv search on base temporal layer, and only + // on base spatial layer if highest resolution is above 640x360. if (cpi->svc.number_temporal_layers > 2 && - cpi->svc.temporal_layer_id == 0) { + cpi->svc.temporal_layer_id == 0 && + (cpi->svc.spatial_layer_id == 0 || + cpi->oxcf.width * cpi->oxcf.height <= 640 * 360)) { sf->mv.search_method = NSTEP; sf->mv.fullpel_search_step_param = 6; } + if (cpi->svc.temporal_layer_id > 0 || cpi->svc.spatial_layer_id > 1) { + sf->use_simple_block_yrd = 1; + if (cpi->svc.non_reference_frame) + sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE; + } + if (cpi->use_svc && cpi->row_mt && cpi->oxcf.max_threads > 1) + sf->adaptive_rd_thresh_row_mt = 1; + // Enable partition copy. For SVC only enabled for top spatial resolution + // layer. + cpi->max_copied_frame = 0; + if (!cpi->last_frame_dropped && cpi->resize_state == ORIG && + !cpi->external_resize && + (!cpi->use_svc || + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { + sf->copy_partition_flag = 1; + cpi->max_copied_frame = 2; + // The top temporal enhancement layer (for number of temporal layers > 1) + // are non-reference frames, so use large/max value for max_copied_frame. + if (cpi->svc.number_temporal_layers > 1 && + cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1) + cpi->max_copied_frame = 255; + } + // For SVC: enable use of lower resolution partition for higher resolution, + // only for 3 spatial layers and when config/top resolution is above VGA. + // Enable only for non-base temporal layer frames. + if (cpi->use_svc && cpi->svc.number_spatial_layers == 3 && + cpi->svc.temporal_layer_id > 0 && + cpi->oxcf.width * cpi->oxcf.height > 640 * 480) + sf->svc_use_lowres_part = 1; } if (speed >= 8) { sf->adaptive_rd_thresh = 4; - sf->mv.subpel_force_stop = (content == VP9E_CONTENT_SCREEN) ? 3 : 2; - sf->lpf_pick = LPF_PICK_MINIMAL_LPF; + sf->skip_encode_sb = 1; + sf->nonrd_keyframe = 1; + if (!cpi->use_svc) cpi->max_copied_frame = 4; + if (cpi->row_mt && cpi->oxcf.max_threads > 1) + sf->adaptive_rd_thresh_row_mt = 1; + + if (content == VP9E_CONTENT_SCREEN) sf->mv.subpel_force_stop = 3; + if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF; // Only keep INTRA_DC mode for speed 8. 
if (!is_keyframe) { int i = 0; @@ -501,10 +633,38 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR && content != VP9E_CONTENT_SCREEN) { // More aggressive short circuit for speed 8. - sf->short_circuit_low_temp_var = 2; + sf->short_circuit_low_temp_var = 3; + // Use level 2 for noisey cases as there is a regression in some + // noisy clips with level 3. + if (cpi->noise_estimate.enabled && cm->width >= 1280 && + cm->height >= 720) { + NOISE_LEVEL noise_level = + vp9_noise_estimate_extract_level(&cpi->noise_estimate); + if (noise_level >= kMedium) sf->short_circuit_low_temp_var = 2; + } + // Since the short_circuit_low_temp_var is used, reduce the + // adaptive_rd_thresh level. + if (cm->width * cm->height > 352 * 288) + sf->adaptive_rd_thresh = 1; + else + sf->adaptive_rd_thresh = 2; } sf->limit_newmv_early_exit = 0; - sf->bias_golden = 0; + sf->use_simple_block_yrd = 1; + } + if (sf->use_altref_onepass) { + if (cpi->rc.is_src_frame_alt_ref && cm->frame_type != KEY_FRAME) { + sf->partition_search_type = FIXED_PARTITION; + sf->always_this_block_size = BLOCK_64X64; + } + if (cpi->count_arf_frame_usage == NULL) + cpi->count_arf_frame_usage = + (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->count_arf_frame_usage)); + if (cpi->count_lastgolden_frame_usage == NULL) + cpi->count_lastgolden_frame_usage = + (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->count_lastgolden_frame_usage)); } } @@ -514,6 +674,12 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { RD_OPT *const rd = &cpi->rd; int i; + // best quality defaults + // Some speed-up features even for best quality as minimal impact on quality. + sf->partition_search_breakout_thr.dist = (1 << 19); + sf->partition_search_breakout_thr.rate = 80; + sf->ml_partition_search_early_termination = 0; + if (oxcf->mode == REALTIME) { set_rt_speed_feature_framesize_dependent(cpi, sf, oxcf->speed); } else if (oxcf->mode == GOOD) { @@ -535,6 +701,21 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { rd->thresh_mult_sub8x8[i] = INT_MAX; } } + + // With row based multi-threading, the following speed features + // have to be disabled to guarantee that bitstreams encoded with single thread + // and multiple threads match. + // It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since + // adaptive_rd_thresh is defined per-row for non-rd pickmode. + if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact && + oxcf->max_threads > 1) + sf->adaptive_rd_thresh = 0; + + // This is only used in motion vector unit test. + if (cpi->oxcf.motion_vector_unit_test == 1) + cpi->find_fractional_mv_step = vp9_return_max_sub_pixel_mv; + else if (cpi->oxcf.motion_vector_unit_test == 2) + cpi->find_fractional_mv_step = vp9_return_min_sub_pixel_mv; } void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { @@ -589,6 +770,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->tx_domain_thresh = 99.0; sf->allow_quant_coeff_opt = sf->optimize_coefficients; sf->quant_opt_thresh = 99.0; + sf->allow_acl = 1; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; @@ -626,45 +808,25 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { // Some speed-up features even for best quality as minimal impact on quality. 
sf->adaptive_rd_thresh = 1; sf->tx_size_search_breakout = 1; - sf->partition_search_breakout_dist_thr = (1 << 19); - sf->partition_search_breakout_rate_thr = 80; - if (oxcf->mode == REALTIME) - set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content); - else if (oxcf->mode == GOOD) - set_good_speed_feature(cpi, cm, sf, oxcf->speed); - - cpi->full_search_sad = vp9_full_search_sad; - cpi->diamond_search_sad = vp9_diamond_search_sad; - - sf->allow_exhaustive_searches = 1; - if (oxcf->mode == BEST) { - if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) - sf->exhaustive_searches_thresh = (1 << 20); - else - sf->exhaustive_searches_thresh = (1 << 21); - sf->max_exaustive_pct = 100; + sf->exhaustive_searches_thresh = + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20) + : INT_MAX; + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { for (i = 0; i < MAX_MESH_STEP; ++i) { sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range; sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval; } - } else { - int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed; - if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) - sf->exhaustive_searches_thresh = (1 << 22); - else - sf->exhaustive_searches_thresh = (1 << 23); - sf->max_exaustive_pct = good_quality_max_mesh_pct[speed]; - if (speed > 0) - sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1; - - for (i = 0; i < MAX_MESH_STEP; ++i) { - sf->mesh_patterns[i].range = good_quality_mesh_patterns[speed][i].range; - sf->mesh_patterns[i].interval = - good_quality_mesh_patterns[speed][i].interval; - } } + if (oxcf->mode == REALTIME) + set_rt_speed_feature_framesize_independent(cpi, sf, oxcf->speed, + oxcf->content); + else if (oxcf->mode == GOOD) + set_good_speed_feature_framesize_independent(cpi, cm, sf, oxcf->speed); + + cpi->diamond_search_sad = vp9_diamond_search_sad; + // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. if (oxcf->pass == 1) sf->optimize_coefficients = 0; @@ -696,4 +858,19 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { if (!cpi->oxcf.frame_periodic_boost) { sf->max_delta_qindex = 0; } + + // With row based multi-threading, the following speed features + // have to be disabled to guarantee that bitstreams encoded with single thread + // and multiple threads match. + // It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since + // adaptive_rd_thresh is defined per-row for non-rd pickmode. + if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact && + oxcf->max_threads > 1) + sf->adaptive_rd_thresh = 0; + + // This is only used in motion vector unit test. 
+ if (cpi->oxcf.motion_vector_unit_test == 1) + cpi->find_fractional_mv_step = vp9_return_max_sub_pixel_mv; + else if (cpi->oxcf.motion_vector_unit_test == 2) + cpi->find_fractional_mv_step = vp9_return_min_sub_pixel_mv; } diff --git a/libs/libvpx/vp9/encoder/vp9_speed_features.h b/libs/libvpx/vp9/encoder/vp9_speed_features.h index 6d0b9420a1..50d52bc23a 100644 --- a/libs/libvpx/vp9/encoder/vp9_speed_features.h +++ b/libs/libvpx/vp9/encoder/vp9_speed_features.h @@ -193,6 +193,11 @@ typedef struct MV_SPEED_FEATURES { int fullpel_search_step_param; } MV_SPEED_FEATURES; +typedef struct PARTITION_SEARCH_BREAKOUT_THR { + int64_t dist; + int rate; +} PARTITION_SEARCH_BREAKOUT_THR; + #define MAX_MESH_STEP 4 typedef struct MESH_PATTERN { @@ -226,8 +231,13 @@ typedef struct SPEED_FEATURES { // This variable is used to cap the maximum number of times we skip testing a // mode to be evaluated. A high value means we will be faster. + // Turned off when (row_mt_bit_exact == 1 && adaptive_rd_thresh_row_mt == 0). int adaptive_rd_thresh; + // Flag to use adaptive_rd_thresh when row-mt is enabled, only for non-rd + // pickmode. + int adaptive_rd_thresh_row_mt; + // Enables skipping the reconstruction step (idct, recon) in the // intermediate steps assuming the last frame didn't have too many intra // blocks and the q is less than a threshold. @@ -244,6 +254,10 @@ typedef struct SPEED_FEATURES { int allow_quant_coeff_opt; double quant_opt_thresh; + // Enable asymptotic closed-loop encoding decision for key frame and + // alternate reference frames. + int allow_acl; + // Use transform domain distortion. Use pixel domain distortion in speed 0 // and certain situations in higher speed to improve the RD model precision. int allow_txfm_domain_distortion; @@ -313,15 +327,9 @@ typedef struct SPEED_FEATURES { // point for this motion search and limits the search range around it. int adaptive_motion_search; - // Flag for allowing some use of exhaustive searches; - int allow_exhaustive_searches; - // Threshold for allowing exhaustive motion search. int exhaustive_searches_thresh; - // Maximum number of exhaustive searches for a frame. - int max_exaustive_pct; - // Pattern to be used for any exhaustive mesh searches. MESH_PATTERN mesh_patterns[MAX_MESH_STEP]; @@ -438,8 +446,10 @@ typedef struct SPEED_FEATURES { INTERP_FILTER_MASK interp_filter_search_mask; // Partition search early breakout thresholds. - int64_t partition_search_breakout_dist_thr; - int partition_search_breakout_rate_thr; + PARTITION_SEARCH_BREAKOUT_THR partition_search_breakout_thr; + + // Machine-learning based partition search early termination + int ml_partition_search_early_termination; // Allow skipping partition search for still image frame int allow_partition_search_skip; @@ -452,11 +462,13 @@ typedef struct SPEED_FEATURES { int short_circuit_flat_blocks; // Skip a number of expensive mode evaluations for blocks with very low - // temporal variance. - // 1: Skip golden non-zeromv and ALL INTRA for bsize >= 32x32. + // temporal variance. If the low temporal variance flag is set for a block, + // do the following: + // 1: Skip all golden modes and ALL INTRA for bsize >= 32x32. // 2: Skip golden non-zeromv and newmv-last for bsize >= 16x16, skip ALL // INTRA for bsize >= 32x32 and vert/horz INTRA for bsize 16x16, 16x32 and // 32x16. + // 3: Same as (2), but also skip golden zeromv.
int short_circuit_low_temp_var; // Limits the rd-threshold update for early exit for the newmv-last mode, @@ -469,6 +481,33 @@ typedef struct SPEED_FEATURES { // Bias to use base mv and skip 1/4 subpel search when use base mv in // enhancement layer. int base_mv_aggressive; + + // Global flag to enable partition copy from the previous frame. + int copy_partition_flag; + + // Compute the source sad for every superblock of the frame, + // prior to encoding the frame, to be used to bypass some encoder decisions. + int use_source_sad; + + int use_simple_block_yrd; + + // If source sad of superblock is high (> adapt_partition_thresh), will switch + // from VARIANCE_PARTITION to REFERENCE_PARTITION (which selects partition + // based on the nonrd-pickmode). + int adapt_partition_source_sad; + int adapt_partition_thresh; + + // Enable use of alt-refs in 1 pass VBR. + int use_altref_onepass; + + // Enable use of compound prediction, for nonrd_pickmode with nonzero lag. + int use_compound_nonrd_pickmode; + + // Always use nonrd_pick_intra for all block sizes on keyframes. + int nonrd_keyframe; + + // For SVC: enables use of partition from lower spatial resolution. + int svc_use_lowres_part; } SPEED_FEATURES; struct VP9_COMP; diff --git a/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c index 2d29e268b1..2636bd9a58 100644 --- a/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c +++ b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c @@ -36,12 +36,16 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->scaled_temp_is_alloc = 0; svc->scaled_one_half = 0; svc->current_superframe = 0; + svc->non_reference_frame = 0; + for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1; for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { - cpi->svc.ext_frame_flags[sl] = 0; - cpi->svc.ext_lst_fb_idx[sl] = 0; - cpi->svc.ext_gld_fb_idx[sl] = 1; - cpi->svc.ext_alt_fb_idx[sl] = 2; + svc->ext_frame_flags[sl] = 0; + svc->ext_lst_fb_idx[sl] = 0; + svc->ext_gld_fb_idx[sl] = 1; + svc->ext_alt_fb_idx[sl] = 2; + svc->downsample_filter_type[sl] = EIGHTTAP; + svc->downsample_filter_phase[sl] = 0; // Set to 8 for averaging filter. 
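+ // Note on the defaults above: phase 0 with EIGHTTAP amounts to plain
+ // sub-sampling of the source; vp9_one_pass_cbr_svc_start_layer() later in
+ // this change switches layers at VGA resolution or below to BILINEAR with
+ // phase 8, which centers the target pixel for a symmetric averaging filter.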
} if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { @@ -171,7 +175,7 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, RATE_CONTROL *const lrc = &lc->rc; lc->spatial_layer_target_bandwidth = spatial_layer_target; - bitrate_alloc = (float)lc->target_bandwidth / spatial_layer_target; + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; lrc->starting_buffer_level = (int64_t)(rc->starting_buffer_level * bitrate_alloc); lrc->optimal_buffer_level = @@ -349,6 +353,7 @@ void vp9_save_layer_context(VP9_COMP *const cpi) { } } +#if !CONFIG_REALTIME_ONLY void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) { SVC *const svc = &cpi->svc; int i; @@ -364,6 +369,7 @@ void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) { } svc->spatial_layer_id = 0; } +#endif // !CONFIG_REALTIME_ONLY void vp9_inc_frame_in_layer(VP9_COMP *const cpi) { LAYER_CONTEXT *const lc = @@ -384,9 +390,9 @@ int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) { .is_key_frame; } -static void get_layer_resolution(const int width_org, const int height_org, - const int num, const int den, int *width_out, - int *height_out) { +void get_layer_resolution(const int width_org, const int height_org, + const int num, const int den, int *width_out, + int *height_out) { int w, h; if (width_out == NULL || height_out == NULL || den == 0) return; @@ -601,6 +607,7 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { LAYER_CONTEXT *lc = NULL; if (cpi->svc.number_spatial_layers > 1) cpi->svc.use_base_mv = 1; cpi->svc.force_zero_mode_spatial_ref = 1; + cpi->svc.mi_stride[cpi->svc.spatial_layer_id] = cpi->common.mi_stride; if (cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { set_flags_and_fb_idx_for_temporal_mode3(cpi); @@ -650,6 +657,37 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { lc->scaling_factor_num, lc->scaling_factor_den, &width, &height); + // For resolutions <= VGA: set phase of the filter = 8 (for symmetric + // averaging filter), use bilinear for now. + if (width * height <= 640 * 480) { + cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] = BILINEAR; + cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] = 8; + } + + // The usage of use_base_mv assumes down-scale of 2x2. For now, turn off use + // of base motion vectors if spatial scale factors for any layers are not 2, + // keep the case of 3 spatial layers with scale factor of 4x4 for base layer. + // TODO(marpan): Fix this to allow for use_base_mv for scale factors != 2. 
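+ // Illustrative cases for the check below: scaling_factor_num = 1 with
+ // scaling_factor_den = 2 (i.e. a 2x2 downscale per spatial layer) keeps
+ // use_base_mv enabled; num = 1 with den = 4 is accepted only for the base
+ // layer (sl == 0) of a 3 spatial layer configuration; any other ratio
+ // disables use_base_mv.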
+ if (cpi->svc.number_spatial_layers > 1) { + int sl; + for (sl = 0; sl < cpi->svc.number_spatial_layers - 1; ++sl) { + lc = &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id]; + if ((lc->scaling_factor_num != lc->scaling_factor_den >> 1) && + !(lc->scaling_factor_num == lc->scaling_factor_den >> 2 && sl == 0 && + cpi->svc.number_spatial_layers == 3)) { + cpi->svc.use_base_mv = 0; + break; + } + } + } + + cpi->svc.non_reference_frame = 0; + if (cpi->common.frame_type != KEY_FRAME && !cpi->ext_refresh_last_frame && + !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame) { + cpi->svc.non_reference_frame = 1; + } + if (vp9_set_size_literal(cpi, width, height) != 0) return VPX_CODEC_INVALID_PARAM; @@ -824,3 +862,28 @@ void vp9_svc_reset_key_frame(VP9_COMP *const cpi) { vp9_update_temporal_layer_framerate(cpi); vp9_restore_layer_context(cpi); } + +void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) { + SVC *svc = &cpi->svc; + int sl, tl; + for (sl = 0; sl < svc->number_spatial_layers; ++sl) { + // Check for reset based on avg_frame_bandwidth for spatial layer sl. + int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + if (lrc->avg_frame_bandwidth > (3 * lrc->last_avg_frame_bandwidth >> 1) || + lrc->avg_frame_bandwidth < (lrc->last_avg_frame_bandwidth >> 1)) { + // Reset for all temporal layers with spatial layer sl. + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->rc_1_frame = 0; + lrc->rc_2_frame = 0; + lrc->bits_off_target = lrc->optimal_buffer_level; + lrc->buffer_level = lrc->optimal_buffer_level; + } + } + } +} diff --git a/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h index ee7a6638b4..b7cdfd9623 100644 --- a/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h +++ b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h @@ -49,7 +49,7 @@ typedef struct { uint8_t speed; } LAYER_CONTEXT; -typedef struct { +typedef struct SVC { int spatial_layer_id; int temporal_layer_id; int number_spatial_layers; @@ -87,7 +87,20 @@ typedef struct { int ref_frame_index[REF_FRAMES]; int force_zero_mode_spatial_ref; int current_superframe; + int non_reference_frame; int use_base_mv; + // Used to control the downscaling filter for source scaling, for 1 pass CBR. + // downsample_filter_phase: = 0 will do sub-sampling (no weighted average), + // = 8 will center the target pixel and get a symmetric averaging filter. + // downsample_filter_type: 4 filters may be used: eighttap_regular, + // eighttap_smooth, eighttap_sharp, and bilinear. + INTERP_FILTER downsample_filter_type[VPX_SS_MAX_LAYERS]; + int downsample_filter_phase[VPX_SS_MAX_LAYERS]; + + BLOCK_SIZE *prev_partition_svc; + int mi_stride[VPX_MAX_LAYERS]; + + int first_layer_denoise; } SVC; struct VP9_COMP; @@ -117,6 +130,10 @@ void vp9_save_layer_context(struct VP9_COMP *const cpi); // Initialize second pass rc for spatial svc. 
void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi); +void get_layer_resolution(const int width_org, const int height_org, + const int num, const int den, int *width_out, + int *height_out); + // Increment number of video frames in layer void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi); @@ -137,6 +154,8 @@ void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi); void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi); +void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/libs/libvpx/vp9/encoder/vp9_temporal_filter.c b/libs/libvpx/vp9/encoder/vp9_temporal_filter.c index a167eeb15d..2758c42aeb 100644 --- a/libs/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/libs/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -8,13 +8,17 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include #include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_common.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_encodeframe.h" +#include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mcomp.h" @@ -51,16 +55,19 @@ static void temporal_filter_predictors_mb_c( #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, - 16, 16, which_mv, kernel, MV_PRECISION_Q3, - x, y, xd->bd); + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(y_mb_ptr), stride, + CONVERT_TO_SHORTPTR(&pred[0]), 16, &mv, + scale, 16, 16, which_mv, kernel, + MV_PRECISION_Q3, x, y, xd->bd); - vp9_highbd_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(u_mb_ptr), uv_stride, + CONVERT_TO_SHORTPTR(&pred[256]), uv_block_width, &mv, scale, uv_block_width, uv_block_height, which_mv, kernel, mv_precision_uv, x, y, xd->bd); - vp9_highbd_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(v_mb_ptr), uv_stride, + CONVERT_TO_SHORTPTR(&pred[512]), uv_block_width, &mv, scale, uv_block_width, uv_block_height, which_mv, kernel, mv_precision_uv, x, y, xd->bd); @@ -87,16 +94,23 @@ void vp9_temporal_filter_init(void) { for (i = 1; i < 512; ++i) fixed_divide[i] = 0x80000 / i; } -void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, - uint8_t *frame2, unsigned int block_width, +void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, + const uint8_t *frame2, + unsigned int block_width, unsigned int block_height, int strength, - int filter_weight, unsigned int *accumulator, + int filter_weight, uint32_t *accumulator, uint16_t *count) { unsigned int i, j, k; int modifier; int byte = 0; const int rounding = strength > 0 ? 
1 << (strength - 1) : 0; + assert(strength >= 0); + assert(strength <= 6); + + assert(filter_weight >= 0); + assert(filter_weight <= 2); + for (i = 0, k = 0; i < block_height; i++) { for (j = 0; j < block_width; j++, k++) { int pixel_value = *frame2; @@ -150,11 +164,11 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_temporal_filter_apply_c( - uint8_t *frame1_8, unsigned int stride, uint8_t *frame2_8, + const uint8_t *frame1_8, unsigned int stride, const uint8_t *frame2_8, unsigned int block_width, unsigned int block_height, int strength, - int filter_weight, unsigned int *accumulator, uint16_t *count) { - uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8); - uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8); + int filter_weight, uint32_t *accumulator, uint16_t *count) { + const uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8); + const uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8); unsigned int i, j, k; int modifier; int byte = 0; @@ -208,24 +222,25 @@ void vp9_highbd_temporal_filter_apply_c( } #endif // CONFIG_VP9_HIGHBITDEPTH -static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, - uint8_t *arf_frame_buf, - uint8_t *frame_ptr_buf, - int stride) { - MACROBLOCK *const x = &cpi->td.mb; +static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, + ThreadData *td, + uint8_t *arf_frame_buf, + uint8_t *frame_ptr_buf, + int stride, MV *ref_mv) { + MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - const SEARCH_METHODS old_search_method = mv_sf->search_method; + const SEARCH_METHODS search_method = HEX; int step_param; int sadpb = x->sadperbit16; - int bestsme = INT_MAX; + uint32_t bestsme = UINT_MAX; uint32_t distortion; uint32_t sse; int cost_list[5]; + const MvLimits tmp_mv_limits = x->mv_limits; MV best_ref_mv1 = { 0, 0 }; MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ - MV *ref_mv = &x->e_mbd.mi[0]->bmi[0].as_mv[0].as_mv; // Save input state struct buf_2d src = x->plane[0].src; @@ -243,11 +258,14 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, step_param = mv_sf->reduce_first_step_size; step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); - mv_sf->search_method = HEX; + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + vp9_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param, - sadpb, cond_cost_list(cpi, cost_list), &best_ref_mv1, - ref_mv, 0, 0); - mv_sf->search_method = old_search_method; + search_method, sadpb, cond_cost_list(cpi, cost_list), + &best_ref_mv1, ref_mv, 0, 0); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; // Ignore mv costing by sending NULL pointer instead of cost array bestsme = cpi->find_fractional_mv_step( @@ -263,22 +281,24 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, return bestsme; } -static void temporal_filter_iterate_c(VP9_COMP *cpi, - YV12_BUFFER_CONFIG **frames, - int frame_count, int alt_ref_index, - int strength, - struct scale_factors *scale) { +void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, + int mb_row, int mb_col_start, + int mb_col_end) { + ARNRFilterData *arnr_filter_data = &cpi->arnr_filter_data; + YV12_BUFFER_CONFIG **frames = arnr_filter_data->frames; + int frame_count = arnr_filter_data->frame_count; + int alt_ref_index = arnr_filter_data->alt_ref_index; + int strength = arnr_filter_data->strength; + struct scale_factors *scale = &arnr_filter_data->sf; int byte; int frame; - int mb_col, mb_row; 
+ int mb_col; unsigned int filter_weight; int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4; int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4; - int mb_y_offset = 0; - int mb_uv_offset = 0; - DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]); + DECLARE_ALIGNED(16, uint32_t, accumulator[16 * 16 * 3]); DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]); - MACROBLOCKD *mbd = &cpi->td.mb.e_mbd; + MACROBLOCKD *mbd = &td->mb.e_mbd; YV12_BUFFER_CONFIG *f = frames[alt_ref_index]; uint8_t *dst1, *dst2; #if CONFIG_VP9_HIGHBITDEPTH @@ -290,10 +310,11 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, #endif const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y; const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x; + // Addition of the tile col level offsets + int mb_y_offset = mb_row * 16 * (f->y_stride) + 16 * mb_col_start; + int mb_uv_offset = + mb_row * mb_uv_height * f->uv_stride + mb_uv_width * mb_col_start; - // Save input state - uint8_t *input_buffer[MAX_MB_PLANE]; - int i; #if CONFIG_VP9_HIGHBITDEPTH if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { predictor = CONVERT_TO_BYTEPTR(predictor16); @@ -302,215 +323,186 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, } #endif - for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf; + // Source frames are extended to 16 pixels. This is different than + // L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS) + // A 6/8 tap filter is used for motion search. This requires 2 pixels + // before and 3 pixels after. So the largest Y mv on a border would + // then be 16 - VP9_INTERP_EXTEND. The UV blocks are half the size of the + // Y and therefore only extended by 8. The largest mv that a UV block + // can support is 8 - VP9_INTERP_EXTEND. A UV mv is half of a Y mv. + // (16 - VP9_INTERP_EXTEND) >> 1 which is greater than + // 8 - VP9_INTERP_EXTEND. + // To keep the mv in play for both Y and UV planes the max that it + // can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1). + td->mb.mv_limits.row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND)); + td->mb.mv_limits.row_max = + ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * VP9_INTERP_EXTEND); - for (mb_row = 0; mb_row < mb_rows; mb_row++) { - // Source frames are extended to 16 pixels. This is different than - // L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS) - // A 6/8 tap filter is used for motion search. This requires 2 pixels - // before and 3 pixels after. So the largest Y mv on a border would - // then be 16 - VP9_INTERP_EXTEND. The UV blocks are half the size of the - // Y and therefore only extended by 8. The largest mv that a UV block - // can support is 8 - VP9_INTERP_EXTEND. A UV mv is half of a Y mv. - // (16 - VP9_INTERP_EXTEND) >> 1 which is greater than - // 8 - VP9_INTERP_EXTEND. - // To keep the mv in play for both Y and UV planes the max that it - // can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1). 
- cpi->td.mb.mv_limits.row_min = - -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND)); - cpi->td.mb.mv_limits.row_max = - ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * VP9_INTERP_EXTEND); + for (mb_col = mb_col_start; mb_col < mb_col_end; mb_col++) { + int i, j, k; + int stride; + MV ref_mv; - for (mb_col = 0; mb_col < mb_cols; mb_col++) { - int i, j, k; - int stride; + vp9_zero_array(accumulator, 16 * 16 * 3); + vp9_zero_array(count, 16 * 16 * 3); - memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0])); - memset(count, 0, 16 * 16 * 3 * sizeof(count[0])); + td->mb.mv_limits.col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND)); + td->mb.mv_limits.col_max = + ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND); - cpi->td.mb.mv_limits.col_min = - -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND)); - cpi->td.mb.mv_limits.col_max = - ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND); + if (cpi->oxcf.content == VP9E_CONTENT_FILM) { + unsigned int src_variance; + struct buf_2d src; - for (frame = 0; frame < frame_count; frame++) { - const int thresh_low = 10000; - const int thresh_high = 20000; - - if (frames[frame] == NULL) continue; - - mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0; - mbd->mi[0]->bmi[0].as_mv[0].as_mv.col = 0; - - if (frame == alt_ref_index) { - filter_weight = 2; - } else { - // Find best match in this frame by MC - int err = temporal_filter_find_matching_mb_c( - cpi, frames[alt_ref_index]->y_buffer + mb_y_offset, - frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride); - - // Assign higher weight to matching MB if its error - // score is lower. If not applying MC default behavior - // is to weight all MBs equal. - filter_weight = err < thresh_low ? 2 : err < thresh_high ? 1 : 0; - } - - if (filter_weight != 0) { - // Construct the predictors - temporal_filter_predictors_mb_c( - mbd, frames[frame]->y_buffer + mb_y_offset, - frames[frame]->u_buffer + mb_uv_offset, - frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride, - mb_uv_width, mb_uv_height, mbd->mi[0]->bmi[0].as_mv[0].as_mv.row, - mbd->mi[0]->bmi[0].as_mv[0].as_mv.col, predictor, scale, - mb_col * 16, mb_row * 16); - -#if CONFIG_VP9_HIGHBITDEPTH - if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - int adj_strength = strength + 2 * (mbd->bd - 8); - // Apply the filter (YUV) - vp9_highbd_temporal_filter_apply_c( - f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, - adj_strength, filter_weight, accumulator, count); - vp9_highbd_temporal_filter_apply_c( - f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, - mb_uv_width, mb_uv_height, adj_strength, filter_weight, - accumulator + 256, count + 256); - vp9_highbd_temporal_filter_apply_c( - f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, - mb_uv_width, mb_uv_height, adj_strength, filter_weight, - accumulator + 512, count + 512); - } else { - // Apply the filter (YUV) - vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, 16, strength, - filter_weight, accumulator, count); - vp9_temporal_filter_apply_c( - f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, - mb_uv_width, mb_uv_height, strength, filter_weight, - accumulator + 256, count + 256); - vp9_temporal_filter_apply_c( - f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, - mb_uv_width, mb_uv_height, strength, filter_weight, - accumulator + 512, count + 512); - } -#else - // Apply the filter (YUV) - // TODO(jingning): Need SIMD optimization for this. 
- vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, 16, strength, - filter_weight, accumulator, count); - vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride, - predictor + 256, mb_uv_width, - mb_uv_height, strength, filter_weight, - accumulator + 256, count + 256); - vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride, - predictor + 512, mb_uv_width, - mb_uv_height, strength, filter_weight, - accumulator + 512, count + 512); -#endif // CONFIG_VP9_HIGHBITDEPTH - } - } + src.buf = f->y_buffer + mb_y_offset; + src.stride = f->y_stride; #if CONFIG_VP9_HIGHBITDEPTH if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - uint16_t *dst1_16; - uint16_t *dst2_16; - // Normalize filter output to produce AltRef frame - dst1 = cpi->alt_ref_buffer.y_buffer; - dst1_16 = CONVERT_TO_SHORTPTR(dst1); - stride = cpi->alt_ref_buffer.y_stride; - byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { - unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= fixed_divide[count[k]]; - pval >>= 19; - - dst1_16[byte] = (uint16_t)pval; - - // move to next pixel - byte++; - } - - byte += stride - 16; - } - - dst1 = cpi->alt_ref_buffer.u_buffer; - dst2 = cpi->alt_ref_buffer.v_buffer; - dst1_16 = CONVERT_TO_SHORTPTR(dst1); - dst2_16 = CONVERT_TO_SHORTPTR(dst2); - stride = cpi->alt_ref_buffer.uv_stride; - byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { - for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; - - // U - unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= fixed_divide[count[k]]; - pval >>= 19; - dst1_16[byte] = (uint16_t)pval; - - // V - pval = accumulator[m] + (count[m] >> 1); - pval *= fixed_divide[count[m]]; - pval >>= 19; - dst2_16[byte] = (uint16_t)pval; - - // move to next pixel - byte++; - } - - byte += stride - mb_uv_width; - } + src_variance = + vp9_high_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16, mbd->bd); } else { - // Normalize filter output to produce AltRef frame - dst1 = cpi->alt_ref_buffer.y_buffer; - stride = cpi->alt_ref_buffer.y_stride; - byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { - unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= fixed_divide[count[k]]; - pval >>= 19; - - dst1[byte] = (uint8_t)pval; - - // move to next pixel - byte++; - } - byte += stride - 16; - } - - dst1 = cpi->alt_ref_buffer.u_buffer; - dst2 = cpi->alt_ref_buffer.v_buffer; - stride = cpi->alt_ref_buffer.uv_stride; - byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { - for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; - - // U - unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= fixed_divide[count[k]]; - pval >>= 19; - dst1[byte] = (uint8_t)pval; - - // V - pval = accumulator[m] + (count[m] >> 1); - pval *= fixed_divide[count[m]]; - pval >>= 19; - dst2[byte] = (uint8_t)pval; - - // move to next pixel - byte++; - } - byte += stride - mb_uv_width; - } + src_variance = vp9_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16); } #else + src_variance = vp9_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (src_variance <= 2) strength = VPXMAX(0, (int)strength - 2); + } + + for (frame = 0; frame < frame_count; frame++) { + const uint32_t thresh_low = 10000; + const uint32_t thresh_high = 20000; + + if (frames[frame] == NULL) continue; + + ref_mv.row = 0; + ref_mv.col = 0; + + if (frame == alt_ref_index) { + filter_weight = 2; + } 
else { + // Find best match in this frame by MC + uint32_t err = temporal_filter_find_matching_mb_c( + cpi, td, frames[alt_ref_index]->y_buffer + mb_y_offset, + frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride, + &ref_mv); + + // Assign higher weight to matching MB if its error + // score is lower. If not applying MC default behavior + // is to weight all MBs equal. + filter_weight = err < thresh_low ? 2 : err < thresh_high ? 1 : 0; + } + + if (filter_weight != 0) { + // Construct the predictors + temporal_filter_predictors_mb_c( + mbd, frames[frame]->y_buffer + mb_y_offset, + frames[frame]->u_buffer + mb_uv_offset, + frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride, + mb_uv_width, mb_uv_height, ref_mv.row, ref_mv.col, predictor, scale, + mb_col * 16, mb_row * 16); + +#if CONFIG_VP9_HIGHBITDEPTH + if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + int adj_strength = strength + 2 * (mbd->bd - 8); + // Apply the filter (YUV) + vp9_highbd_temporal_filter_apply( + f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, + adj_strength, filter_weight, accumulator, count); + vp9_highbd_temporal_filter_apply( + f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, + mb_uv_width, mb_uv_height, adj_strength, filter_weight, + accumulator + 256, count + 256); + vp9_highbd_temporal_filter_apply( + f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, + mb_uv_width, mb_uv_height, adj_strength, filter_weight, + accumulator + 512, count + 512); + } else { + // Apply the filter (YUV) + vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, + predictor, 16, 16, strength, filter_weight, + accumulator, count); + vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, + predictor + 256, mb_uv_width, mb_uv_height, + strength, filter_weight, accumulator + 256, + count + 256); + vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, + predictor + 512, mb_uv_width, mb_uv_height, + strength, filter_weight, accumulator + 512, + count + 512); + } +#else + // Apply the filter (YUV) + vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, + predictor, 16, 16, strength, filter_weight, + accumulator, count); + vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, + predictor + 256, mb_uv_width, mb_uv_height, + strength, filter_weight, accumulator + 256, + count + 256); + vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, + predictor + 512, mb_uv_width, mb_uv_height, + strength, filter_weight, accumulator + 512, + count + 512); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *dst1_16; + uint16_t *dst2_16; + // Normalize filter output to produce AltRef frame + dst1 = cpi->alt_ref_buffer.y_buffer; + dst1_16 = CONVERT_TO_SHORTPTR(dst1); + stride = cpi->alt_ref_buffer.y_stride; + byte = mb_y_offset; + for (i = 0, k = 0; i < 16; i++) { + for (j = 0; j < 16; j++, k++) { + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + + dst1_16[byte] = (uint16_t)pval; + + // move to next pixel + byte++; + } + + byte += stride - 16; + } + + dst1 = cpi->alt_ref_buffer.u_buffer; + dst2 = cpi->alt_ref_buffer.v_buffer; + dst1_16 = CONVERT_TO_SHORTPTR(dst1); + dst2_16 = CONVERT_TO_SHORTPTR(dst2); + stride = cpi->alt_ref_buffer.uv_stride; + byte = mb_uv_offset; + for (i = 0, k = 256; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + 256; + + // U + 
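+ // Per-pixel normalization below: accumulator[] holds the weighted sum of
+ // predictor pixels and count[] the sum of the weights. fixed_divide[c] is
+ // 0x80000 / c (0x80000 == 1 << 19), so multiplying by it and shifting right
+ // by 19 divides by the accumulated weight; adding (count >> 1) first gives
+ // round-to-nearest.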
unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + dst1_16[byte] = (uint16_t)pval; + + // V + pval = accumulator[m] + (count[m] >> 1); + pval *= fixed_divide[count[m]]; + pval >>= 19; + dst2_16[byte] = (uint16_t)pval; + + // move to next pixel + byte++; + } + + byte += stride - mb_uv_width; + } + } else { // Normalize filter output to produce AltRef frame dst1 = cpi->alt_ref_buffer.y_buffer; stride = cpi->alt_ref_buffer.y_stride; @@ -554,12 +546,93 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, } byte += stride - mb_uv_width; } -#endif // CONFIG_VP9_HIGHBITDEPTH - mb_y_offset += 16; - mb_uv_offset += mb_uv_width; } - mb_y_offset += 16 * (f->y_stride - mb_cols); - mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols; +#else + // Normalize filter output to produce AltRef frame + dst1 = cpi->alt_ref_buffer.y_buffer; + stride = cpi->alt_ref_buffer.y_stride; + byte = mb_y_offset; + for (i = 0, k = 0; i < 16; i++) { + for (j = 0; j < 16; j++, k++) { + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + + dst1[byte] = (uint8_t)pval; + + // move to next pixel + byte++; + } + byte += stride - 16; + } + + dst1 = cpi->alt_ref_buffer.u_buffer; + dst2 = cpi->alt_ref_buffer.v_buffer; + stride = cpi->alt_ref_buffer.uv_stride; + byte = mb_uv_offset; + for (i = 0, k = 256; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + 256; + + // U + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + dst1[byte] = (uint8_t)pval; + + // V + pval = accumulator[m] + (count[m] >> 1); + pval *= fixed_divide[count[m]]; + pval >>= 19; + dst2[byte] = (uint8_t)pval; + + // move to next pixel + byte++; + } + byte += stride - mb_uv_width; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + mb_y_offset += 16; + mb_uv_offset += mb_uv_width; + } +} + +static void temporal_filter_iterate_tile_c(VP9_COMP *cpi, int tile_row, + int tile_col) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + TileInfo *tile_info = + &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; + const int mb_row_start = (tile_info->mi_row_start) >> 1; + const int mb_row_end = (tile_info->mi_row_end + 1) >> 1; + const int mb_col_start = (tile_info->mi_col_start) >> 1; + const int mb_col_end = (tile_info->mi_col_end + 1) >> 1; + int mb_row; + + for (mb_row = mb_row_start; mb_row < mb_row_end; mb_row++) { + vp9_temporal_filter_iterate_row_c(cpi, &cpi->td, mb_row, mb_col_start, + mb_col_end); + } +} + +static void temporal_filter_iterate_c(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_row, tile_col; + MACROBLOCKD *mbd = &cpi->td.mb.e_mbd; + // Save input state + uint8_t *input_buffer[MAX_MB_PLANE]; + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf; + + vp9_init_tile_data(cpi); + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + temporal_filter_iterate_tile_c(cpi, tile_row, tile_col); + } } // Restore input state @@ -638,14 +711,16 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + ARNRFilterData *arnr_filter_data = &cpi->arnr_filter_data; int frame; int frames_to_blur; int 
start_frame; int strength; int frames_to_blur_backward; int frames_to_blur_forward; - struct scale_factors sf; - YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL }; + struct scale_factors *sf = &arnr_filter_data->sf; + YV12_BUFFER_CONFIG **frames = arnr_filter_data->frames; + int rdmult; // Apply context specific adjustments to the arnr filter parameters. adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength); @@ -653,6 +728,10 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { frames_to_blur_forward = ((frames_to_blur - 1) / 2); start_frame = distance + frames_to_blur_forward; + arnr_filter_data->strength = strength; + arnr_filter_data->frame_count = frames_to_blur; + arnr_filter_data->alt_ref_index = frames_to_blur_backward; + // Setup frame pointers, NULL indicates frame not included in filter. for (frame = 0; frame < frames_to_blur; ++frame) { const int which_buffer = start_frame - frame; @@ -670,13 +749,13 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { int frame_used = 0; #if CONFIG_VP9_HIGHBITDEPTH vp9_setup_scale_factors_for_frame( - &sf, get_frame_new_buffer(cm)->y_crop_width, + sf, get_frame_new_buffer(cm)->y_crop_width, get_frame_new_buffer(cm)->y_crop_height, get_frame_new_buffer(cm)->y_crop_width, get_frame_new_buffer(cm)->y_crop_height, cm->use_highbitdepth); #else vp9_setup_scale_factors_for_frame( - &sf, get_frame_new_buffer(cm)->y_crop_width, + sf, get_frame_new_buffer(cm)->y_crop_width, get_frame_new_buffer(cm)->y_crop_height, get_frame_new_buffer(cm)->y_crop_width, get_frame_new_buffer(cm)->y_crop_height); @@ -697,7 +776,8 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { "Failed to reallocate alt_ref_buffer"); } frames[frame] = vp9_scale_if_required( - cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0); + cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0, + EIGHTTAP, 0); ++frame_used; } } @@ -708,17 +788,25 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { // ARF is produced at the native frame size and resized when coded. #if CONFIG_VP9_HIGHBITDEPTH vp9_setup_scale_factors_for_frame( - &sf, frames[0]->y_crop_width, frames[0]->y_crop_height, + sf, frames[0]->y_crop_width, frames[0]->y_crop_height, frames[0]->y_crop_width, frames[0]->y_crop_height, cm->use_highbitdepth); #else vp9_setup_scale_factors_for_frame( - &sf, frames[0]->y_crop_width, frames[0]->y_crop_height, + sf, frames[0]->y_crop_width, frames[0]->y_crop_height, frames[0]->y_crop_width, frames[0]->y_crop_height); #endif // CONFIG_VP9_HIGHBITDEPTH } } - temporal_filter_iterate_c(cpi, frames, frames_to_blur, - frames_to_blur_backward, strength, &sf); + // Initialize errorperbit and sadperbit.
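+ // The filter's motion search runs outside normal frame encoding, so the
+ // rd constants below are seeded from the fixed ARNR_FILT_QINDEX rather
+ // than the current frame's quantizer.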
+ rdmult = (int)vp9_compute_rd_mult_based_on_qindex(cpi, ARNR_FILT_QINDEX); + if (rdmult < 1) rdmult = 1; + set_error_per_bit(&cpi->td.mb, rdmult); + vp9_initialize_me_consts(cpi, &cpi->td.mb, ARNR_FILT_QINDEX); + + if (!cpi->row_mt) + temporal_filter_iterate_c(cpi); + else + vp9_temporal_filter_row_mt(cpi); } diff --git a/libs/libvpx/vp9/encoder/vp9_temporal_filter.h b/libs/libvpx/vp9/encoder/vp9_temporal_filter.h index f537b8870a..775e49cc53 100644 --- a/libs/libvpx/vp9/encoder/vp9_temporal_filter.h +++ b/libs/libvpx/vp9/encoder/vp9_temporal_filter.h @@ -15,9 +15,15 @@ extern "C" { #endif +#define ARNR_FILT_QINDEX 128 + void vp9_temporal_filter_init(void); void vp9_temporal_filter(VP9_COMP *cpi, int distance); +void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, + int mb_row, int mb_col_start, + int mb_col_end); + #ifdef __cplusplus } // extern "C" #endif diff --git a/libs/libvpx/vp9/encoder/vp9_tokenize.c b/libs/libvpx/vp9/encoder/vp9_tokenize.c index dc2616dbe1..814d769be3 100644 --- a/libs/libvpx/vp9/encoder/vp9_tokenize.c +++ b/libs/libvpx/vp9/encoder/vp9_tokenize.c @@ -123,7 +123,7 @@ const int16_t vp9_cat6_low_cost[256] = { 6620, 6632, 6654, 6666, 6677, 6689, 6751, 6763, 6774, 6786, 6808, 6820, 6831, 6843, 6890, 6902, 6913, 6925, 6947, 6959, 6970, 6982 }; -const int vp9_cat6_high_cost[64] = { +const uint16_t vp9_cat6_high_cost[64] = { 88, 2251, 2727, 4890, 3148, 5311, 5787, 7950, 3666, 5829, 6305, 8468, 6726, 8889, 9365, 11528, 3666, 5829, 6305, 8468, 6726, 8889, 9365, 11528, 7244, 9407, 9883, 12046, 10304, 12467, 12943, 15106, 3666, @@ -133,7 +133,7 @@ const int vp9_cat6_high_cost[64] = { }; #if CONFIG_VP9_HIGHBITDEPTH -const int vp9_cat6_high10_high_cost[256] = { +const uint16_t vp9_cat6_high10_high_cost[256] = { 94, 2257, 2733, 4896, 3154, 5317, 5793, 7956, 3672, 5835, 6311, 8474, 6732, 8895, 9371, 11534, 3672, 5835, 6311, 8474, 6732, 8895, 9371, 11534, 7250, 9413, 9889, 12052, 10310, 12473, 12949, 15112, 3672, @@ -159,7 +159,7 @@ const int vp9_cat6_high10_high_cost[256] = { 18075, 20238, 18496, 20659, 21135, 23298, 19014, 21177, 21653, 23816, 22074, 24237, 24713, 26876 }; -const int vp9_cat6_high12_high_cost[1024] = { +const uint16_t vp9_cat6_high12_high_cost[1024] = { 100, 2263, 2739, 4902, 3160, 5323, 5799, 7962, 3678, 5841, 6317, 8480, 6738, 8901, 9377, 11540, 3678, 5841, 6317, 8480, 6738, 8901, 9377, 11540, 7256, 9419, 9895, 12058, 10316, 12479, 12955, 15118, 3678, diff --git a/libs/libvpx/vp9/encoder/vp9_tokenize.h b/libs/libvpx/vp9/encoder/vp9_tokenize.h index c905715d7d..b2f63ffef5 100644 --- a/libs/libvpx/vp9/encoder/vp9_tokenize.h +++ b/libs/libvpx/vp9/encoder/vp9_tokenize.h @@ -76,25 +76,18 @@ extern const TOKENVALUE *vp9_dct_value_tokens_ptr; extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens; extern const int *vp9_dct_cat_lt_10_value_cost; extern const int16_t vp9_cat6_low_cost[256]; -extern const int vp9_cat6_high_cost[64]; -extern const int vp9_cat6_high10_high_cost[256]; -extern const int vp9_cat6_high12_high_cost[1024]; -static INLINE int vp9_get_cost(int16_t token, EXTRABIT extrabits, - const int *cat6_high_table) { - if (token != CATEGORY6_TOKEN) - return vp9_extra_bits[token].cost[extrabits >> 1]; - return vp9_cat6_low_cost[(extrabits >> 1) & 0xff] + - cat6_high_table[extrabits >> 9]; -} +extern const uint16_t vp9_cat6_high_cost[64]; +extern const uint16_t vp9_cat6_high10_high_cost[256]; +extern const uint16_t vp9_cat6_high12_high_cost[1024]; #if CONFIG_VP9_HIGHBITDEPTH -static INLINE const int *vp9_get_high_cost_table(int bit_depth) { 
+static INLINE const uint16_t *vp9_get_high_cost_table(int bit_depth) { return bit_depth == 8 ? vp9_cat6_high_cost : (bit_depth == 10 ? vp9_cat6_high10_high_cost : vp9_cat6_high12_high_cost); } #else -static INLINE const int *vp9_get_high_cost_table(int bit_depth) { +static INLINE const uint16_t *vp9_get_high_cost_table(int bit_depth) { (void)bit_depth; return vp9_cat6_high_cost; } @@ -118,7 +111,7 @@ static INLINE int16_t vp9_get_token(int v) { } static INLINE int vp9_get_token_cost(int v, int16_t *token, - const int *cat6_high_table) { + const uint16_t *cat6_high_table) { if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) { EXTRABIT extrabits; *token = CATEGORY6_TOKEN; diff --git a/libs/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/libs/libvpx/vp9/encoder/x86/temporal_filter_sse4.c new file mode 100644 index 0000000000..460dab6593 --- /dev/null +++ b/libs/libvpx/vp9/encoder/x86/temporal_filter_sse4.c @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +// Division using multiplication and shifting. The C implementation does: +// modifier *= 3; +// modifier /= index; +// where 'modifier' is a set of summed values and 'index' is the number of +// summed values. 'index' may be 4, 6, or 9, representing a block of 9 values +// which may be bound by the edges of the block being filtered. +// +// This equation works out to (m * 3) / i which reduces to: +// m * 3/4 +// m * 1/2 +// m * 1/3 +// +// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16): +// m * C / 65536 +// we can create a C to replicate the division. +// +// m * 49152 / 65536 = m * 3/4 +// m * 32768 / 65536 = m * 1/2 +// m * 21846 / 65536 = m * 0.3333 +// +// These are loaded using an instruction expecting int16_t values but are used +// with _mm_mulhi_epu16(), which treats them as unsigned. +#define NEIGHBOR_CONSTANT_4 (int16_t)49152 +#define NEIGHBOR_CONSTANT_6 (int16_t)32768 +#define NEIGHBOR_CONSTANT_9 (int16_t)21846 + +// Load values from 'a' and 'b'. Compute the difference squared and sum +// neighboring values such that: +// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2 +// Values to the left and right of the row are set to 0. +// The values are returned in sum_0 and sum_1 as *unsigned* 16 bit values. +static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) { + const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a); + const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b); + + const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8); + const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8); + + const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16); + const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16); + + // Shift all the values one place to the left/right so we can efficiently sum + // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1]. + const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2); + const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2); + + // It becomes necessary to treat the values as unsigned at this point. The + // 255^2 fits in uint16_t but not int16_t.
Use saturating adds from this point + // forward since the filter is only applied to smooth small pixel changes. + // Once the value has saturated to uint16_t it is well outside the useful + // range. + __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left); + sum_u16 = _mm_adds_epu16(sum_u16, shift_right); + + *sum = sum_u16; +} + +static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0, + __m128i *sum_1) { + const __m128i zero = _mm_setzero_si128(); + const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a); + const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b); + + const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8); + const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero); + const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8); + const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero); + + const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16); + const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16); + const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16); + const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16); + + __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2); + // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8]. + __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2); + + __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left); + sum_u16 = _mm_adds_epu16(sum_u16, shift_right); + + *sum_0 = sum_u16; + + shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14); + shift_right = _mm_srli_si128(diff_sq_1_u16, 2); + + sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left); + sum_u16 = _mm_adds_epu16(sum_u16, shift_right); + + *sum_1 = sum_u16; +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static __m128i average_8(__m128i sum, const __m128i mul_constants, + const int strength, const int rounding, + const int weight) { + // _mm_srl_epi16 uses the lower 64 bit value for the shift. + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = _mm_set1_epi16(weight); + const __m128i sixteen = _mm_set1_epi16(16); + + // modifier * 3 / index; + sum = _mm_mulhi_epu16(sum, mul_constants); + + sum = _mm_adds_epu16(sum, rounding_u16); + sum = _mm_srl_epi16(sum, strength_u128); + + // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 + // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385 + // So this needs to use the epu16 version which did not come until SSE4. 
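+ // Worked example of the multiply-high division above: with a summed
+ // modifier of 100 and NEIGHBOR_CONSTANT_4, _mm_mulhi_epu16() computes
+ // (100 * 49152) >> 16 = 75, i.e. 100 * 3 / 4. Clamping to 16 here keeps the
+ // (16 - sum) step below non-negative.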
+ sum = _mm_min_epu16(sum, sixteen); + + sum = _mm_sub_epi16(sixteen, sum); + + return _mm_mullo_epi16(sum, weight_u16); +} + +static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16, + const __m128i mul_constants_0, + const __m128i mul_constants_1, const int strength, + const int rounding, const int weight) { + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = _mm_set1_epi16(weight); + const __m128i sixteen = _mm_set1_epi16(16); + __m128i input_0, input_1; + + input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0); + input_0 = _mm_adds_epu16(input_0, rounding_u16); + + input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1); + input_1 = _mm_adds_epu16(input_1, rounding_u16); + + input_0 = _mm_srl_epi16(input_0, strength_u128); + input_1 = _mm_srl_epi16(input_1, strength_u128); + + input_0 = _mm_min_epu16(input_0, sixteen); + input_1 = _mm_min_epu16(input_1, sixteen); + input_0 = _mm_sub_epi16(sixteen, input_0); + input_1 = _mm_sub_epi16(sixteen, input_1); + + *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16); + *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16); +} + +// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.' +static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred, + uint16_t *count, uint32_t *accumulator) { + const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); + __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8); + __m128i pred_0_u32, pred_1_u32; + __m128i accum_0_u32, accum_1_u32; + + count_u16 = _mm_adds_epu16(count_u16, sum_u16); + _mm_storeu_si128((__m128i *)count, count_u16); + + pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); +} + +static void accumulate_and_store_16(const __m128i sum_0_u16, + const __m128i sum_1_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { + const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count), + count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8)); + __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8), + pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero); + __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32; + __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32; + + count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16); + _mm_storeu_si128((__m128i *)count, count_0_u16); + + count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16); + _mm_storeu_si128((__m128i *)(count + 8), count_1_u16); + + pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16); + pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero); + pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16); + pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = 
_mm_loadu_si128((const __m128i *)(accumulator + 4)); + accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8)); + accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32); + accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); + _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32); + _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32); +} + +void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride, + const uint8_t *b, unsigned int width, + unsigned int height, int strength, + int weight, uint32_t *accumulator, + uint16_t *count) { + unsigned int h; + const int rounding = strength > 0 ? 1 << (strength - 1) : 0; + + assert(strength >= 0); + assert(strength <= 6); + + assert(weight >= 0); + assert(weight <= 2); + + assert(width == 8 || width == 16); + + if (width == 8) { + __m128i sum_row_a, sum_row_b, sum_row_c; + __m128i mul_constants = _mm_setr_epi16( + NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); + + sum_8(a, b, &sum_row_a); + sum_8(a + stride, b + width, &sum_row_b); + sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b); + sum_row_c = average_8(sum_row_c, mul_constants, strength, rounding, weight); + accumulate_and_store_8(sum_row_c, b, count, accumulator); + + a += stride + stride; + b += width; + count += width; + accumulator += width; + + mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6); + + for (h = 0; h < height - 2; ++h) { + sum_8(a, b + width, &sum_row_c); + sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b); + sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c); + sum_row_a = + average_8(sum_row_a, mul_constants, strength, rounding, weight); + accumulate_and_store_8(sum_row_a, b, count, accumulator); + + a += stride; + b += width; + count += width; + accumulator += width; + + sum_row_a = sum_row_b; + sum_row_b = sum_row_c; + } + + mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); + sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b); + sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight); + accumulate_and_store_8(sum_row_a, b, count, accumulator); + + } else { // width == 16 + __m128i sum_row_a_0, sum_row_a_1; + __m128i sum_row_b_0, sum_row_b_1; + __m128i sum_row_c_0, sum_row_c_1; + __m128i mul_constants_0 = _mm_setr_epi16( + NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6), + mul_constants_1 = _mm_setr_epi16( + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); + + sum_16(a, b, &sum_row_a_0, &sum_row_a_1); + sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1); + + sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); + sum_row_c_1 = 
_mm_adds_epu16(sum_row_a_1, sum_row_b_1); + + average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1, + strength, rounding, weight); + accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator); + + a += stride + stride; + b += width; + count += width; + accumulator += width; + + mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9); + mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, + NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6); + for (h = 0; h < height - 2; ++h) { + sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1); + + sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); + sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0); + sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); + sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1); + + average_16(&sum_row_a_0, &sum_row_a_1, mul_constants_0, mul_constants_1, + strength, rounding, weight); + accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator); + + a += stride; + b += width; + count += width; + accumulator += width; + + sum_row_a_0 = sum_row_b_0; + sum_row_a_1 = sum_row_b_1; + sum_row_b_0 = sum_row_c_0; + sum_row_b_1 = sum_row_c_1; + } + + mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6); + mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); + sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); + sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); + + average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1, + strength, rounding, weight); + accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator); + } +} diff --git a/libs/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/libs/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c index 0712779b75..dbd243ac10 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c +++ b/libs/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -15,6 +15,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/x86/fwd_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" #include "vpx_ports/mem.h" @@ -71,7 +72,7 @@ static INLINE void transpose_4x4(__m128i *res) { } static void fdct4_sse2(__m128i *in) { - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); @@ -181,19 +182,19 @@ void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, int16_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, int16_t 
*qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { __m128i zero; int pass; + // Constants // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); @@ -215,8 +216,6 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, int index = 0; (void)scan_ptr; - (void)zbin_ptr; - (void)quant_shift_ptr; (void)coeff_ptr; // Pre-condition input (shift by two) @@ -708,61 +707,9 @@ static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res, store_output(&res[7], (output + 7 * stride)); } -// perform in-place transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 44 54 45 55 46 56 47 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 25 35 - // 44 54 64 74 45 55 65 75 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 -} - static void fdct8_sse2(__m128i *in) { // constants - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = 
pair_set_epi16(-cospi_8_64, cospi_24_64); @@ -897,7 +844,7 @@ static void fdct8_sse2(__m128i *in) { in[7] = _mm_packs_epi32(v6, v7); // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); } static void fadst8_sse2(__m128i *in) { @@ -914,7 +861,7 @@ static void fadst8_sse2(__m128i *in) { const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__const_0 = _mm_set1_epi16(0); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -1127,7 +1074,7 @@ static void fadst8_sse2(__m128i *in) { in[7] = _mm_sub_epi16(k__const_0, s1); // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); } void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, @@ -1184,23 +1131,6 @@ static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0, write_buffer_8x8(output + 8 * stride, in1 + 8, stride); } -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { // perform rounding operations right_shift_8x8(res0, 2); @@ -1212,7 +1142,7 @@ static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { static void fdct16_8col(__m128i *in) { // perform 16x16 1-D DCT for 8 columns __m128i i[8], s[8], p[8], t[8], u[16], v[16]; - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); @@ -1559,8 +1489,8 @@ static void fadst16_8col(__m128i *in) { const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -2004,13 +1934,13 @@ static void fadst16_8col(__m128i *in) { static void fdct16_sse2(__m128i *in0, __m128i *in1) { fdct16_8col(in0); fdct16_8col(in1); - array_transpose_16x16(in0, in1); + transpose_16bit_16x16(in0, in1); } static void fadst16_sse2(__m128i *in0, __m128i *in1) { fadst16_8col(in0); fadst16_8col(in1); - array_transpose_16x16(in0, in1); + transpose_16bit_16x16(in0, in1); } void vp9_fht16x16_sse2(const int16_t *input, 
tran_low_t *output, int stride, diff --git a/libs/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm b/libs/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm index ced37bd16e..8152dce864 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm +++ b/libs/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm @@ -11,6 +11,7 @@ %define private_prefix vp9 %include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" SECTION .text @@ -62,25 +63,7 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride psllw m0, 2 psllw m1, 2 -%if CONFIG_VP9_HIGHBITDEPTH - ; sign extension - mova m2, m0 - mova m3, m1 - punpcklwd m0, m0 - punpcklwd m1, m1 - punpckhwd m2, m2 - punpckhwd m3, m3 - psrad m0, 16 - psrad m1, 16 - psrad m2, 16 - psrad m3, 16 - mova [outputq], m0 - mova [outputq + 16], m2 - mova [outputq + 32], m1 - mova [outputq + 48], m3 -%else - mova [outputq], m0 - mova [outputq + 16], m1 -%endif + STORE_TRAN_LOW 0, outputq, 0, 2, 3 + STORE_TRAN_LOW 1, outputq, 8, 2, 3 RET diff --git a/libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c b/libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c index fb2a925417..bf874a09ec 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c +++ b/libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c @@ -12,23 +12,26 @@ #include <tmmintrin.h> // SSSE3 #include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" void vp9_fdct8x8_quant_ssse3( - const int16_t *input, int stride, int16_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, const int16_t *quant_shift_ptr, - int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { __m128i zero; int pass; + // Constants // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair.
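The constants comment above describes the pattern shared by the SSE2 and SSSE3 forward transforms: a cosine pair is replicated across all eight 16-bit lanes so that one PMADDWD over interleaved inputs evaluates the butterfly a*c0 + b*c1 in every 32-bit lane, before the usual rounding by DCT_CONST_ROUNDING and shift by DCT_CONST_BITS. A minimal sketch of that pattern, assuming only SSE2; make_pair_epi16 is an illustrative stand-in for the pair_set_epi16() helper used in these files, and the coefficient values are arbitrary.

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for pair_set_epi16(): replicate (c0, c1) across
 * the eight 16-bit lanes, c0 in the even lanes and c1 in the odd lanes. */
static __m128i make_pair_epi16(int16_t c0, int16_t c1) {
  return _mm_set_epi16(c1, c0, c1, c0, c1, c0, c1, c0);
}

int main(void) {
  const __m128i a = _mm_set1_epi16(3);          /* one butterfly input */
  const __m128i b = _mm_set1_epi16(5);          /* the other input */
  const __m128i lo = _mm_unpacklo_epi16(a, b);  /* a0 b0 a1 b1 ... */
  const __m128i k = make_pair_epi16(7, -2);     /* example pair {c0, c1} */
  const __m128i prod = _mm_madd_epi16(lo, k);   /* a*c0 + b*c1 per 32-bit lane */
  int32_t out[4];
  _mm_storeu_si128((__m128i *)out, prod);
  printf("%d\n", out[0]);                       /* 3*7 + 5*(-2) = 11 */
  return 0;
}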
const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); @@ -50,8 +53,6 @@ void vp9_fdct8x8_quant_ssse3( int index = 0; (void)scan_ptr; - (void)zbin_ptr; - (void)quant_shift_ptr; (void)coeff_ptr; // Pre-condition input (shift by two) @@ -328,15 +329,15 @@ void vp9_fdct8x8_quant_ssse3( qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } { @@ -398,20 +399,21 @@ void vp9_fdct8x8_quant_ssse3( qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } else { - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); + // Maybe a more efficient way to store 0? 
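The store_tran_low() and store_zero_tran_low() calls replacing the raw _mm_store_si128() stores come from the newly included bitdepth_conversion_sse2.h: when CONFIG_VP9_HIGHBITDEPTH is enabled, tran_low_t is 32 bits wide, so a vector of eight 16-bit results has to be sign-extended into two 128-bit stores, while plain builds can store the vector as is (and the zero case is simply two zero stores). A hedged sketch of the widening half of that job; the names below are illustrative, not the header's.

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <stdio.h>

typedef int32_t tran_low_t_example;  /* stand-in for tran_low_t in high-bit-depth builds */

/* Widen eight signed 16-bit lanes into eight 32-bit values and store them. */
static void store_widened_example(__m128i a, tran_low_t_example *b) {
  const __m128i sign = _mm_srai_epi16(a, 15);      /* 0x0000 or 0xFFFF per lane */
  const __m128i lo = _mm_unpacklo_epi16(a, sign);  /* sign-extend lanes 0..3 */
  const __m128i hi = _mm_unpackhi_epi16(a, sign);  /* sign-extend lanes 4..7 */
  _mm_storeu_si128((__m128i *)(b + 0), lo);
  _mm_storeu_si128((__m128i *)(b + 4), hi);
}

int main(void) {
  const __m128i v = _mm_setr_epi16(-3, 7, -32768, 32767, 0, 1, -1, 100);
  tran_low_t_example out[8];
  store_widened_example(v, out);
  printf("%d %d %d\n", (int)out[0], (int)out[2], (int)out[3]);  /* -3 -32768 32767 */
  return 0;
}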
+ store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); } } @@ -452,10 +454,10 @@ void vp9_fdct8x8_quant_ssse3( } } else { do { - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); n_coeffs += 8 * 2; } while (n_coeffs < 0); *eob_ptr = 0; diff --git a/libs/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c b/libs/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c index 91d0602f9d..5930bf491e 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c +++ b/libs/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c @@ -13,7 +13,6 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" -#include "vpx_ports/emmintrin_compat.h" #include "vpx/vpx_integer.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/encoder/vp9_context_tree.h" diff --git a/libs/libvpx/vp9/encoder/x86/vp9_error_avx2.c b/libs/libvpx/vp9/encoder/x86/vp9_error_avx2.c new file mode 100644 index 0000000000..99fef31d16 --- /dev/null +++ b/libs/libvpx/vp9/encoder/x86/vp9_error_avx2.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" + +int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + __m256i sse_256, ssz_256; + __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; + __m256i sse_hi, ssz_hi; + __m128i sse_128, ssz_128; + int64_t sse; + const __m256i zero = _mm256_setzero_si256(); + + // If the block size is 16 then the results will fit in 32 bits. + if (block_size == 16) { + __m256i coeff_256, dqcoeff_256, coeff_hi, dqcoeff_hi; + // Load 16 elements for coeff and dqcoeff. + coeff_256 = load_tran_low(coeff); + dqcoeff_256 = load_tran_low(dqcoeff); + // dqcoeff - coeff + dqcoeff_256 = _mm256_sub_epi16(dqcoeff_256, coeff_256); + // madd (dqcoeff - coeff) + dqcoeff_256 = _mm256_madd_epi16(dqcoeff_256, dqcoeff_256); + // madd coeff + coeff_256 = _mm256_madd_epi16(coeff_256, coeff_256); + // Save the higher 64 bit of each 128 bit lane. + dqcoeff_hi = _mm256_srli_si256(dqcoeff_256, 8); + coeff_hi = _mm256_srli_si256(coeff_256, 8); + // Add the higher 64 bit to the low 64 bit. + dqcoeff_256 = _mm256_add_epi32(dqcoeff_256, dqcoeff_hi); + coeff_256 = _mm256_add_epi32(coeff_256, coeff_hi); + // Expand each double word in the lower 64 bits to quad word. 
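For orientation, this is the quantity vp9_block_error_avx2() accumulates: sse, the sum of squared differences between the dequantized and original coefficients, and ssz, the energy of the original coefficients (vp9_block_error_fp_avx2() further down returns only the former). A plain scalar reference, written here only as a sketch; tran_low_t_example stands in for tran_low_t.

#include <stdint.h>

typedef int32_t tran_low_t_example;  /* stand-in for tran_low_t */

int64_t block_error_ref(const tran_low_t_example *coeff,
                        const tran_low_t_example *dqcoeff,
                        intptr_t block_size, int64_t *ssz) {
  int64_t sse = 0, sz = 0;
  intptr_t i;
  for (i = 0; i < block_size; ++i) {
    const int64_t diff = (int64_t)dqcoeff[i] - coeff[i];
    sse += diff * diff;                  /* distortion left after quantization */
    sz += (int64_t)coeff[i] * coeff[i];  /* energy of the original coefficients */
  }
  *ssz = sz;
  return sse;
}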
+ sse_256 = _mm256_unpacklo_epi32(dqcoeff_256, zero); + ssz_256 = _mm256_unpacklo_epi32(coeff_256, zero); + } else { + int i; + assert(block_size % 32 == 0); + sse_256 = zero; + ssz_256 = zero; + + for (i = 0; i < block_size; i += 32) { + __m256i coeff_0, coeff_1, dqcoeff_0, dqcoeff_1; + // Load 32 elements for coeff and dqcoeff. + coeff_0 = load_tran_low(coeff + i); + dqcoeff_0 = load_tran_low(dqcoeff + i); + coeff_1 = load_tran_low(coeff + i + 16); + dqcoeff_1 = load_tran_low(dqcoeff + i + 16); + // dqcoeff - coeff + dqcoeff_0 = _mm256_sub_epi16(dqcoeff_0, coeff_0); + dqcoeff_1 = _mm256_sub_epi16(dqcoeff_1, coeff_1); + // madd (dqcoeff - coeff) + dqcoeff_0 = _mm256_madd_epi16(dqcoeff_0, dqcoeff_0); + dqcoeff_1 = _mm256_madd_epi16(dqcoeff_1, dqcoeff_1); + // madd coeff + coeff_0 = _mm256_madd_epi16(coeff_0, coeff_0); + coeff_1 = _mm256_madd_epi16(coeff_1, coeff_1); + // Add the first madd (dqcoeff - coeff) with the second. + dqcoeff_0 = _mm256_add_epi32(dqcoeff_0, dqcoeff_1); + // Add the first madd (coeff) with the second. + coeff_0 = _mm256_add_epi32(coeff_0, coeff_1); + // Expand each double word of madd (dqcoeff - coeff) to quad word. + exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_0, zero); + exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_0, zero); + // expand each double word of madd (coeff) to quad word + exp_coeff_lo = _mm256_unpacklo_epi32(coeff_0, zero); + exp_coeff_hi = _mm256_unpackhi_epi32(coeff_0, zero); + // Add each quad word of madd (dqcoeff - coeff) and madd (coeff). + sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_lo); + ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_lo); + sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_hi); + ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_hi); + } + } + // Save the higher 64 bit of each 128 bit lane. + sse_hi = _mm256_srli_si256(sse_256, 8); + ssz_hi = _mm256_srli_si256(ssz_256, 8); + // Add the higher 64 bit to the low 64 bit. + sse_256 = _mm256_add_epi64(sse_256, sse_hi); + ssz_256 = _mm256_add_epi64(ssz_256, ssz_hi); + + // Add each 64 bit from each of the 128 bit lane of the 256 bit. + sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), + _mm256_extractf128_si256(sse_256, 1)); + + ssz_128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_256), + _mm256_extractf128_si256(ssz_256, 1)); + + // Store the results. + _mm_storel_epi64((__m128i *)(&sse), sse_128); + + _mm_storel_epi64((__m128i *)(ssz), ssz_128); + return sse; +} + +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size) { + int i; + const __m256i zero = _mm256_setzero_si256(); + __m256i sse_256 = zero; + __m256i sse_hi; + __m128i sse_128; + int64_t sse; + + if (block_size == 16) { + // Load 16 elements for coeff and dqcoeff. + const __m256i _coeff = load_tran_low(coeff); + const __m256i _dqcoeff = load_tran_low(dqcoeff); + // dqcoeff - coeff + const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); + // madd (dqcoeff - coeff) + const __m256i error_lo = _mm256_madd_epi16(diff, diff); + // Save the higher 64 bit of each 128 bit lane. + const __m256i error_hi = _mm256_srli_si256(error_lo, 8); + // Add the higher 64 bit to the low 64 bit. + const __m256i error = _mm256_add_epi32(error_lo, error_hi); + // Expand each double word in the lower 64 bits to quad word. + sse_256 = _mm256_unpacklo_epi32(error, zero); + } else { + for (i = 0; i < block_size; i += 16) { + // Load 16 elements for coeff and dqcoeff. 
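Both error kernels finish with the same horizontal reduction: four 64-bit partial sums sit in one 256-bit register, the upper half of each 128-bit lane is folded onto the lower half, the two lanes are added in a 128-bit register, and the low 64 bits are stored out. A self-contained sketch of that tail, assuming AVX2 is available.

#include <immintrin.h>  /* AVX2 */
#include <stdint.h>

/* Reduce four 64-bit partial sums held in a 256-bit register to one value. */
int64_t hsum_epi64_example(__m256i v) {
  /* Fold the upper 64 bits of each 128-bit lane onto the lower 64 bits. */
  const __m256i hi64 = _mm256_srli_si256(v, 8);
  const __m256i folded = _mm256_add_epi64(v, hi64);
  /* Add the two 128-bit lanes together, then keep the low 64 bits. */
  const __m128i sum = _mm_add_epi64(_mm256_castsi256_si128(folded),
                                    _mm256_extractf128_si256(folded, 1));
  int64_t out;
  _mm_storel_epi64((__m128i *)&out, sum);
  return out;
}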
+ const __m256i _coeff = load_tran_low(coeff); + const __m256i _dqcoeff = load_tran_low(dqcoeff); + const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); + const __m256i error = _mm256_madd_epi16(diff, diff); + // Expand each double word of madd (dqcoeff - coeff) to quad word. + const __m256i exp_error_lo = _mm256_unpacklo_epi32(error, zero); + const __m256i exp_error_hi = _mm256_unpackhi_epi32(error, zero); + // Add each quad word of madd (dqcoeff - coeff). + sse_256 = _mm256_add_epi64(sse_256, exp_error_lo); + sse_256 = _mm256_add_epi64(sse_256, exp_error_hi); + coeff += 16; + dqcoeff += 16; + } + } + // Save the higher 64 bit of each 128 bit lane. + sse_hi = _mm256_srli_si256(sse_256, 8); + // Add the higher 64 bit to the low 64 bit. + sse_256 = _mm256_add_epi64(sse_256, sse_hi); + + // Add each 64 bit from each of the 128 bit lane of the 256 bit. + sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), + _mm256_extractf128_si256(sse_256, 1)); + + // Store the results. + _mm_storel_epi64((__m128i *)&sse, sse_128); + return sse; +} diff --git a/libs/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c b/libs/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c deleted file mode 100644 index 453af2a403..0000000000 --- a/libs/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Usee of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include // AVX2 - -#include "./vp9_rtcd.h" -#include "vpx/vpx_integer.h" - -int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff, - intptr_t block_size, int64_t *ssz) { - __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg; - __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; - __m256i sse_reg_64hi, ssz_reg_64hi; - __m128i sse_reg128, ssz_reg128; - int64_t sse; - int i; - const __m256i zero_reg = _mm256_set1_epi16(0); - - // init sse and ssz registerd to zero - sse_reg = _mm256_set1_epi16(0); - ssz_reg = _mm256_set1_epi16(0); - - for (i = 0; i < block_size; i += 16) { - // load 32 bytes from coeff and dqcoeff - coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i)); - dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i)); - // dqcoeff - coeff - dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg); - // madd (dqcoeff - coeff) - dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg); - // madd coeff - coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg); - // expand each double word of madd (dqcoeff - coeff) to quad word - exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg); - exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg); - // expand each double word of madd (coeff) to quad word - exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg); - exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg); - // add each quad word of madd (dqcoeff - coeff) and madd (coeff) - sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo); - ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo); - sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi); - ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi); - } - // save the higher 64 bit of each 128 bit lane - sse_reg_64hi = _mm256_srli_si256(sse_reg, 8); - ssz_reg_64hi = 
_mm256_srli_si256(ssz_reg, 8); - // add the higher 64 bit to the low 64 bit - sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi); - ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi); - - // add each 64 bit from each of the 128 bit lane of the 256 bit - sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg), - _mm256_extractf128_si256(sse_reg, 1)); - - ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg), - _mm256_extractf128_si256(ssz_reg, 1)); - - // store the results - _mm_storel_epi64((__m128i *)(&sse), sse_reg128); - - _mm_storel_epi64((__m128i *)(ssz), ssz_reg128); - return sse; -} diff --git a/libs/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/libs/libvpx/vp9/encoder/x86/vp9_error_sse2.asm index 5b0238272b..11d473b2df 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_error_sse2.asm +++ b/libs/libvpx/vp9/encoder/x86/vp9_error_sse2.asm @@ -11,6 +11,7 @@ %define private_prefix vp9 %include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" SECTION .text @@ -22,14 +23,14 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz pxor m4, m4 ; sse accumulator pxor m6, m6 ; ssz accumulator pxor m5, m5 ; dedicated zero register - lea uqcq, [uqcq+sizeq*2] - lea dqcq, [dqcq+sizeq*2] - neg sizeq .loop: - mova m2, [uqcq+sizeq*2] - mova m0, [dqcq+sizeq*2] - mova m3, [uqcq+sizeq*2+mmsize] - mova m1, [dqcq+sizeq*2+mmsize] + LOAD_TRAN_LOW 2, uqcq, 0 + LOAD_TRAN_LOW 0, dqcq, 0 + LOAD_TRAN_LOW 3, uqcq, 8 + LOAD_TRAN_LOW 1, dqcq, 8 + INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16 + INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16 + sub sizeq, 16 psubw m0, m2 psubw m1, m3 ; individual errors are max. 15bit+sign, so squares are 30bit, and @@ -38,25 +39,19 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 + ; the sum of 2 31bit integers will fit in a 32bit unsigned integer + paddd m0, m1 + paddd m2, m3 ; accumulate in 64bit punpckldq m7, m0, m5 punpckhdq m0, m5 paddq m4, m7 - punpckldq m7, m1, m5 - paddq m4, m0 - punpckhdq m1, m5 - paddq m4, m7 punpckldq m7, m2, m5 - paddq m4, m1 + paddq m4, m0 punpckhdq m2, m5 paddq m6, m7 - punpckldq m7, m3, m5 paddq m6, m2 - punpckhdq m3, m5 - paddq m6, m7 - paddq m6, m3 - add sizeq, mmsize - jl .loop + jg .loop ; accumulate horizontally and store in return value movhlps m5, m4 @@ -75,39 +70,37 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz %endif RET -; Compute the sum of squared difference between two int16_t vectors. -; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff, +; Compute the sum of squared difference between two tran_low_t vectors. +; Vectors are converted (if necessary) to int16_t for calculations. +; int64_t vp9_block_error_fp(tran_low_t *coeff, tran_low_t *dqcoeff, ; intptr_t block_size) INIT_XMM sse2 cglobal block_error_fp, 3, 3, 6, uqc, dqc, size pxor m4, m4 ; sse accumulator pxor m5, m5 ; dedicated zero register - lea uqcq, [uqcq+sizeq*2] - lea dqcq, [dqcq+sizeq*2] - neg sizeq .loop: - mova m2, [uqcq+sizeq*2] - mova m0, [dqcq+sizeq*2] - mova m3, [uqcq+sizeq*2+mmsize] - mova m1, [dqcq+sizeq*2+mmsize] + LOAD_TRAN_LOW 2, uqcq, 0 + LOAD_TRAN_LOW 0, dqcq, 0 + LOAD_TRAN_LOW 3, uqcq, 8 + LOAD_TRAN_LOW 1, dqcq, 8 + INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16 + INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16 + sub sizeq, 16 psubw m0, m2 psubw m1, m3 ; individual errors are max. 
15bit+sign, so squares are 30bit, and ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) pmaddwd m0, m0 pmaddwd m1, m1 + ; the sum of 2 31bit integers will fit in a 32bit unsigned integer + paddd m0, m1 ; accumulate in 64bit punpckldq m3, m0, m5 punpckhdq m0, m5 paddq m4, m3 - punpckldq m3, m1, m5 paddq m4, m0 - punpckhdq m1, m5 - paddq m4, m3 - paddq m4, m1 - add sizeq, mmsize - jl .loop + jnz .loop ; accumulate horizontally and store in return value movhlps m5, m4 diff --git a/libs/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/libs/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c index fa2a6449b0..7685e7bc3e 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c +++ b/libs/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -13,192 +13,895 @@ #include "./vp9_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" +#include "vpx_dsp/x86/convolve_ssse3.h" +#include "vpx_dsp/x86/mem_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_scale/yv12config.h" -extern void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst); +static INLINE __m128i scale_plane_2_to_1_phase_0_kernel( + const uint8_t *const src, const __m128i *const mask) { + const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0])); + const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16])); + const __m128i a_and = _mm_and_si128(a, *mask); + const __m128i b_and = _mm_and_si128(b, *mask); + return _mm_packus_epi16(a_and, b_and); +} -static void downsample_2_to_1_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int w, - int h) { +static void scale_plane_2_to_1_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h) { + const int max_width = (dst_w + 15) & ~15; const __m128i mask = _mm_set1_epi16(0x00FF); - const int max_width = w & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask); + _mm_storeu_si128((__m128i *)dst, d); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_4_to_1_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h) { + const int max_width = (dst_w + 15) & ~15; + const __m128i mask = _mm_set1_epi32(0x000000FF); + int y = dst_h; + + do { + int x = max_width; + do { + const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask); + const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask); + const __m128i d2 = _mm_packus_epi16(d0, d1); + _mm_storeu_si128((__m128i *)dst, d2); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s, + const __m128i c0c1) { + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1); + const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1); + // round and shift by 7 bit each 16 bit + const __m128i t2 = _mm_adds_epi16(t0, k_64); + const __m128i t3 = _mm_adds_epi16(t1, k_64); + const __m128i t4 = _mm_srai_epi16(t2, 7); + const __m128i t5 = _mm_srai_epi16(t3, 7); + return _mm_packus_epi16(t4, t5); +} + +static void scale_plane_2_to_1_bilinear(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t 
*dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h, + const __m128i c0c1) { + const int max_width = (dst_w + 15) & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + __m128i s[2], d[2]; + + // Horizontal + // Even rows + s[0] = _mm_loadu_si128((const __m128i *)(src + 0)); + s[1] = _mm_loadu_si128((const __m128i *)(src + 16)); + d[0] = scale_plane_bilinear_kernel(s, c0c1); + + // odd rows + s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); + s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); + d[1] = scale_plane_bilinear_kernel(s, c0c1); + + // Vertical + s[0] = _mm_unpacklo_epi8(d[0], d[1]); + s[1] = _mm_unpackhi_epi8(d[0], d[1]); + d[0] = scale_plane_bilinear_kernel(s, c0c1); + + _mm_storeu_si128((__m128i *)dst, d[0]); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_4_to_1_bilinear(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h, + const __m128i c0c1) { + const int max_width = (dst_w + 15) & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + __m128i s[8], d[8]; + + // Note: Using _mm_packus_epi32() in SSE4.1 could be faster. + // Here we tried to not use shuffle instructions which would be slow + // on some x86 CPUs. + + // Horizontal + // 000 001 xx xx 004 005 xx xx 008 009 xx xx 00C 00D xx xx + // 010 011 xx xx 014 015 xx xx 018 019 xx xx 01C 01D xx xx + // 020 021 xx xx 024 025 xx xx 028 029 xx xx 02C 02D xx xx + // 030 031 xx xx 034 035 xx xx 038 039 xx xx 03C 03D xx xx + // 100 101 xx xx 104 105 xx xx 108 109 xx xx 10C 10D xx xx + // 110 111 xx xx 114 115 xx xx 118 119 xx xx 11C 11D xx xx + // 120 121 xx xx 124 125 xx xx 128 129 xx xx 12C 12D xx xx + // 130 131 xx xx 134 135 xx xx 138 139 xx xx 13C 13D xx xx + s[0] = _mm_loadu_si128((const __m128i *)(&src[0])); + s[1] = _mm_loadu_si128((const __m128i *)(&src[16])); + s[2] = _mm_loadu_si128((const __m128i *)(&src[32])); + s[3] = _mm_loadu_si128((const __m128i *)(&src[48])); + s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); + s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); + s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32)); + s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48)); + + // 000 001 100 101 xx xx xx xx 004 005 104 105 xx xx xx xx + // 008 009 108 109 xx xx xx xx 00C 00D 10C 10D xx xx xx xx + // 010 011 110 111 xx xx xx xx 014 015 114 115 xx xx xx xx + // 018 019 118 119 xx xx xx xx 01C 01D 11C 11D xx xx xx xx + // 020 021 120 121 xx xx xx xx 024 025 124 125 xx xx xx xx + // 028 029 128 129 xx xx xx xx 02C 02D 12C 12D xx xx xx xx + // 030 031 130 131 xx xx xx xx 034 035 134 135 xx xx xx xx + // 038 039 138 139 xx xx xx xx 03C 03D 13C 13D xx xx xx xx + d[0] = _mm_unpacklo_epi16(s[0], s[4]); + d[1] = _mm_unpackhi_epi16(s[0], s[4]); + d[2] = _mm_unpacklo_epi16(s[1], s[5]); + d[3] = _mm_unpackhi_epi16(s[1], s[5]); + d[4] = _mm_unpacklo_epi16(s[2], s[6]); + d[5] = _mm_unpackhi_epi16(s[2], s[6]); + d[6] = _mm_unpacklo_epi16(s[3], s[7]); + d[7] = _mm_unpackhi_epi16(s[3], s[7]); + + // 000 001 100 101 008 009 108 109 xx xx xx xx xx xx xx xx + // 004 005 104 105 00C 00D 10C 10D xx xx xx xx xx xx xx xx + // 010 011 110 111 018 019 118 119 xx xx xx xx xx xx xx xx + // 014 015 114 115 01C 01D 11C 11D xx xx xx xx xx xx xx xx + // 020 021 120 121 028 029 128 129 xx xx xx xx xx xx xx xx + // 024 025 124 125 
02C 02D 12C 12D xx xx xx xx xx xx xx xx + // 030 031 130 131 038 039 138 139 xx xx xx xx xx xx xx xx + // 034 035 134 135 03C 03D 13C 13D xx xx xx xx xx xx xx xx + s[0] = _mm_unpacklo_epi32(d[0], d[1]); + s[1] = _mm_unpackhi_epi32(d[0], d[1]); + s[2] = _mm_unpacklo_epi32(d[2], d[3]); + s[3] = _mm_unpackhi_epi32(d[2], d[3]); + s[4] = _mm_unpacklo_epi32(d[4], d[5]); + s[5] = _mm_unpackhi_epi32(d[4], d[5]); + s[6] = _mm_unpacklo_epi32(d[6], d[7]); + s[7] = _mm_unpackhi_epi32(d[6], d[7]); + + // 000 001 100 101 004 005 104 105 008 009 108 109 00C 00D 10C 10D + // 010 011 110 111 014 015 114 115 018 019 118 119 01C 01D 11C 11D + // 020 021 120 121 024 025 124 125 028 029 128 129 02C 02D 12C 12D + // 030 031 130 131 034 035 134 135 038 039 138 139 03C 03D 13C 13D + d[0] = _mm_unpacklo_epi32(s[0], s[1]); + d[1] = _mm_unpacklo_epi32(s[2], s[3]); + d[2] = _mm_unpacklo_epi32(s[4], s[5]); + d[3] = _mm_unpacklo_epi32(s[6], s[7]); + + d[0] = scale_plane_bilinear_kernel(&d[0], c0c1); + d[1] = scale_plane_bilinear_kernel(&d[2], c0c1); + + // Vertical + d[0] = scale_plane_bilinear_kernel(d, c0c1); + + _mm_storeu_si128((__m128i *)dst, d[0]); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 3) & ~3; + const int width_ver = (w + 7) & ~7; + const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 3) & ~3; + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[11], d[4]; + __m128i f[4]; + + assert(w && h); + + shuffle_filter_ssse3(coef, f); + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1; + + // horizontal 4x8 + do { + load_8bit_8x8(src + 2, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[3]); + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D + transpose_16bit_4x8(&s[3], &s[3]); + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 + d[1] = convolve8_8_ssse3(&s[1], f); // 01 11 21 31 41 51 61 71 + d[2] = convolve8_8_ssse3(&s[2], f); // 02 12 22 32 42 52 62 72 + d[3] = convolve8_8_ssse3(&s[3], f); // 03 13 23 33 43 53 63 73 + + // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 + // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 + d[0] = _mm_packus_epi16(d[0], d[2]); + d[1] = _mm_packus_epi16(d[1], d[3]); + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 + d[2] = _mm_unpacklo_epi16(d[0], d[1]); + d[3] = _mm_unpackhi_epi16(d[0], d[1]); + // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 + // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 + d[0] = _mm_unpacklo_epi32(d[2], d[3]); + d[1] = _mm_unpackhi_epi32(d[2], d[3]); + store_8bit_8x4_from_16x2(d, t, 2 * width_hor); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + + t += 8; + x -= 4; + } while (x); + src += 8 * src_stride - 2 * width_hor; + t += 6 * 
width_hor; + y -= 8; + } while (y); + + // vertical 8x4 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); + s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); + s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor)); + t += 6 * width_hor; + y = height_ver; + + do { + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 + // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 77 + loadu_8bit_16x4(t, 2 * width_hor, &s[3]); + t += 8 * width_hor; + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07 + d[1] = convolve8_8_ssse3(&s[1], f); // 10 11 12 13 14 15 16 17 + d[2] = convolve8_8_ssse3(&s[2], f); // 20 21 22 23 24 25 26 27 + d[3] = convolve8_8_ssse3(&s[3], f); // 30 31 32 33 34 35 36 37 + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + d[0] = _mm_packus_epi16(d[0], d[1]); + d[1] = _mm_packus_epi16(d[2], d[3]); + store_8bit_8x4_from_16x2(d, dst, dst_stride); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + + dst += 4 * dst_stride; + y -= 4; + } while (y); + t -= width_hor * (2 * height_ver + 6); + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 1) & ~1; + const int width_ver = (w + 7) & ~7; + const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 1) & ~1; + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[11], d[4]; + __m128i f[4]; + + assert(w && h); + + shuffle_filter_ssse3(coef, f); + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3; + + // horizontal 2x8 + do { + load_8bit_8x8(src + 4, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 (overlapped) + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[2]); + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + transpose_16bit_4x8(&s[2], &s[2]); + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 + d[1] = convolve8_8_ssse3(&s[2], f); // 01 11 21 31 41 51 61 71 + + // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx + // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + d[0] = _mm_packus_epi16(d[0], d[0]); + d[1] = _mm_packus_epi16(d[1], d[1]); + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + d[0] = _mm_unpacklo_epi16(d[0], d[1]); + store_8bit_4x4_sse2(d[0], t, 2 * width_hor); + + s[0] = s[4]; + s[1] = s[5]; + + t += 4; + x -= 2; + } while (x); + src += 8 * src_stride - 4 * width_hor; + t += 6 * width_hor; + y -= 8; + } while (y); + + // vertical 8x2 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + 
s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); + s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); + t += 4 * width_hor; + y = height_ver; + + do { + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 + loadu_8bit_16x4(t, 2 * width_hor, &s[2]); + t += 8 * width_hor; + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07 + d[1] = convolve8_8_ssse3(&s[2], f); // 10 11 12 13 14 15 16 17 + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + d[0] = _mm_packus_epi16(d[0], d[1]); + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); + + s[0] = s[4]; + s[1] = s[5]; + + dst += 2 * dst_stride; + y -= 2; + } while (y); + t -= width_hor * (4 * height_ver + 4); + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +typedef void (*shuffle_filter_funcs)(const int16_t *const filter, + __m128i *const f); + +typedef __m128i (*convolve8_funcs)(const __m128i *const s, + const __m128i *const f); + +static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const InterpKernel *const coef, + const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = 2 * width_hor + 4; // store 4 extra pixels + const int width_ver = (w + 7) & ~7; + // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows + // above and (SUBPEL_TAPS / 2) extra rows below. + const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[12], d[6], dd[4]; + __m128i f0[4], f1[5], f2[5]; + // The offset of the first row is always less than 1 pixel. + const int offset1_q4 = phase_scaler + 1 * step_q4; + const int offset2_q4 = phase_scaler + 2 * step_q4; + // offset_idxx indicates the pixel offset is even (0) or odd (1). + // It's used to choose the src offset and filter coefficient offset. + const int offset_idx1 = (offset1_q4 >> 4) & 1; + const int offset_idx2 = (offset2_q4 >> 4) & 1; + static const shuffle_filter_funcs shuffle_filter_funcs[2] = { + shuffle_filter_ssse3, shuffle_filter_odd_ssse3 + }; + static const convolve8_funcs convolve8_funcs[2] = { + convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3 + }; + + assert(w && h); + + shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0); + shuffle_filter_funcs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); + shuffle_filter_funcs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); + + // Sub 64 to avoid overflow. + // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here. + // Coef 128 is in either fx[1] or fx[2] depending on the phase idx. + // When filter phase idx is 1, the two biggest coefficients are shuffled + // together, and the sum of them are always no less than 128. Sub 64 here. + // After the subtraction, when the sum of all positive coefficients are no + // larger than 128, and the sum of all negative coefficients are no + // less than -128, there will be no overflow in the convolve8 functions. 
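The comment above is the key constraint for PMADDUBSW-based convolution: the instruction reads the filter operand as signed bytes and saturates each 16-bit pair sum, so a tap of 128 (or an adjacent pair whose sum exceeds 127) cannot be fed in directly. Subtracting 64 from the offending tap keeps everything representable, and the missing 64*src term can be added back separately (presumably inside the convolve8 helpers the shuffled filters feed into). A scalar illustration of why the two routes agree:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint8_t src = 200;
  const int tap = 128;              /* largest possible 8-tap coefficient */
  const int stored = tap - 64;      /* what is actually fed to PMADDUBSW */
  const int partial = src * stored; /* now well inside the signed 16-bit range */
  const int full = partial + src * 64;  /* add the 64*src term back */
  printf("%d %d\n", src * tap, full);   /* both print 25600 */
  return 0;
}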
+ f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64)); + f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64)); + f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64)); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1; + + // horizontal 6x8 + do { + load_8bit_8x8(src, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[4]); + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + // OC 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D + // 0E 0F 1E 1F 2E 2F 3E 3F 4E 4F 5E 5F 6E 6F 7E 7F + transpose_16bit_4x8(&s[4], &s[4]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 + // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx + // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx + dd[0] = _mm_packus_epi16(d[0], d[2]); + dd[1] = _mm_packus_epi16(d[1], d[3]); + dd[2] = _mm_packus_epi16(d[4], d[4]); + dd[3] = _mm_packus_epi16(d[5], d[5]); + + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 + // 04 14 05 15 24 34 25 35 44 54 45 55 64 74 65 75 + d[0] = _mm_unpacklo_epi16(dd[0], dd[1]); + d[1] = _mm_unpackhi_epi16(dd[0], dd[1]); + d[2] = _mm_unpacklo_epi16(dd[2], dd[3]); + + // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 + // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 + // 04 14 05 15 xx xx xx xx 24 34 25 35 xx xx xx xx + // 44 54 45 55 xx xx xx xx 64 74 65 75 xx xx xx xx + dd[0] = _mm_unpacklo_epi32(d[0], d[1]); + dd[1] = _mm_unpackhi_epi32(d[0], d[1]); + dd[2] = _mm_unpacklo_epi32(d[2], d[2]); + dd[3] = _mm_unpackhi_epi32(d[2], d[2]); + + // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx + // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx + // 40 50 41 51 42 52 43 53 44 54 45 55 xx xx xx xx + // 60 70 61 71 62 72 63 73 64 74 65 75 xx xx xx xx + d[0] = _mm_unpacklo_epi64(dd[0], dd[2]); + d[1] = _mm_unpackhi_epi64(dd[0], dd[2]); + d[2] = _mm_unpacklo_epi64(dd[1], dd[3]); + d[3] = _mm_unpackhi_epi64(dd[1], dd[3]); + + // store 4 extra pixels + storeu_8bit_16x4(d, t, stride_hor); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + t += 12; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3; + t += 3 * stride_hor + 4; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + loadu_8bit_16x4(t, stride_hor, s); + y = height_ver; + + do { + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97 + // A0 B0 
A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7 + // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7 + // E0 F0 E1 F1 E2 F2 E3 F3 E4 F4 E5 F5 E6 F6 E7 F7 + t += 4 * stride_hor; + loadu_8bit_16x4(t, stride_hor, &s[4]); + + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57 + d[0] = _mm_packus_epi16(d[0], d[1]); + d[2] = _mm_packus_epi16(d[2], d[3]); + d[4] = _mm_packus_epi16(d[4], d[5]); + + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]); + _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]); + _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * 2 * height_ver / 3; + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s, + const __m128i *const f) { + __m128i ss[4], temp; + + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + temp = convolve8_8_ssse3(ss, f); + return _mm_packus_epi16(temp, temp); +} + +// Only calculate odd columns since even columns are just src pixels' copies. 
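Before the SSSE3 row kernel that follows, here is a scalar sketch of what phase-0 1-to-2 scaling means for a single row: every even destination column is a straight copy of a source pixel, and every odd column is an 8-tap interpolation halfway between two source pixels. The taps below are the ones hard-coded in the removed filter() helper further down (-1, 6, -19, 78, 78, -19, 6, -1); the new path instead takes its kernel from the coef argument. This is an illustration only and ignores edge handling: it reads 3 pixels left and 4 pixels right of each source column.

#include <stdint.h>

static uint8_t clip_u8(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

/* One row of phase-0 1:2 upscaling; src must have 3 valid pixels to the left
 * and 4 to the right of the range being read. */
void upscale_row_1_to_2_ref(const uint8_t *src, uint8_t *dst, int src_w) {
  static const int taps[8] = { -1, 6, -19, 78, 78, -19, 6, -1 };  /* sum = 128 */
  int x, k;
  for (x = 0; x < src_w; ++x) {
    int sum = 64;                 /* rounding term before the >> 7 */
    dst[2 * x] = src[x];          /* even column: plain copy of the source */
    for (k = 0; k < 8; ++k) sum += taps[k] * src[x + k - 3];
    dst[2 * x + 1] = clip_u8(sum >> 7);  /* odd column: filtered half-pel value */
  }
}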
+static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst, + const int w, const __m128i *const f) { + int x = w; + + do { + __m128i s[8], temp; + s[0] = _mm_loadl_epi64((const __m128i *)(src + 0)); + s[1] = _mm_loadl_epi64((const __m128i *)(src + 1)); + s[2] = _mm_loadl_epi64((const __m128i *)(src + 2)); + s[3] = _mm_loadl_epi64((const __m128i *)(src + 3)); + s[4] = _mm_loadl_epi64((const __m128i *)(src + 4)); + s[5] = _mm_loadl_epi64((const __m128i *)(src + 5)); + s[6] = _mm_loadl_epi64((const __m128i *)(src + 6)); + s[7] = _mm_loadl_epi64((const __m128i *)(src + 7)); + temp = scale_1_to_2_phase_0_kernel(s, f); + _mm_storel_epi64((__m128i *)dst, temp); + src += 8; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_1_to_2_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int src_w, const int src_h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + int max_width; int y; - for (y = 0; y < h; ++y) { + uint8_t *tmp[9]; + __m128i f[4]; + + max_width = (src_w + 7) & ~7; + tmp[0] = temp_buffer + 0 * max_width; + tmp[1] = temp_buffer + 1 * max_width; + tmp[2] = temp_buffer + 2 * max_width; + tmp[3] = temp_buffer + 3 * max_width; + tmp[4] = temp_buffer + 4 * max_width; + tmp[5] = temp_buffer + 5 * max_width; + tmp[6] = temp_buffer + 6 * max_width; + tmp[7] = temp_buffer + 7 * max_width; + + shuffle_filter_ssse3(coef, f); + + scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f); + scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f); + scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f); + scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f); + scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f); + scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f); + scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f); + + y = src_h; + do { int x; - for (x = 0; x < max_width; x += 16) { - const __m128i a = _mm_loadu_si128((const __m128i *)(src + x * 2 + 0)); - const __m128i b = _mm_loadu_si128((const __m128i *)(src + x * 2 + 16)); - const __m128i a_and = _mm_and_si128(a, mask); - const __m128i b_and = _mm_and_si128(b, mask); - const __m128i c = _mm_packus_epi16(a_and, b_and); - _mm_storeu_si128((__m128i *)(dst + x), c); + scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f); + for (x = 0; x < max_width; x += 8) { + __m128i s[8], C, D, CD; + + // Even rows + const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x)); + const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); + const __m128i ab = _mm_unpacklo_epi8(a, b); + _mm_storeu_si128((__m128i *)(dst + 2 * x), ab); + + // Odd rows + // Even columns + load_8bit_8x8(src + x - 3 * src_stride, src_stride, s); + C = scale_1_to_2_phase_0_kernel(s, f); + + // Odd columns + s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x)); + s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x)); + s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x)); + s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); + s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x)); + s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x)); + s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x)); + s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x)); + D = scale_1_to_2_phase_0_kernel(s, f); + + CD = _mm_unpacklo_epi8(C, D); + _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD); } - for (; x < w; ++x) dst[x] = src[x * 
2]; - src += src_stride * 2; - dst += dst_stride; - } -} -static INLINE __m128i filter(const __m128i *const a, const __m128i *const b, - const __m128i *const c, const __m128i *const d, - const __m128i *const e, const __m128i *const f, - const __m128i *const g, const __m128i *const h) { - const __m128i coeffs_ab = - _mm_set_epi8(6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1); - const __m128i coeffs_cd = _mm_set_epi8(78, -19, 78, -19, 78, -19, 78, -19, 78, - -19, 78, -19, 78, -19, 78, -19); - const __m128i const64_x16 = _mm_set1_epi16(64); - const __m128i ab = _mm_unpacklo_epi8(*a, *b); - const __m128i cd = _mm_unpacklo_epi8(*c, *d); - const __m128i fe = _mm_unpacklo_epi8(*f, *e); - const __m128i hg = _mm_unpacklo_epi8(*h, *g); - const __m128i ab_terms = _mm_maddubs_epi16(ab, coeffs_ab); - const __m128i cd_terms = _mm_maddubs_epi16(cd, coeffs_cd); - const __m128i fe_terms = _mm_maddubs_epi16(fe, coeffs_cd); - const __m128i hg_terms = _mm_maddubs_epi16(hg, coeffs_ab); - // can not overflow - const __m128i abcd_terms = _mm_add_epi16(ab_terms, cd_terms); - // can not overflow - const __m128i fehg_terms = _mm_add_epi16(fe_terms, hg_terms); - // can overflow, use saturating add - const __m128i terms = _mm_adds_epi16(abcd_terms, fehg_terms); - const __m128i round = _mm_adds_epi16(terms, const64_x16); - const __m128i shift = _mm_srai_epi16(round, 7); - return _mm_packus_epi16(shift, shift); -} - -static void eight_tap_row_ssse3(const uint8_t *src, uint8_t *dst, int w) { - const int max_width = w & ~7; - int x = 0; - for (; x < max_width; x += 8) { - const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x + 0)); - const __m128i b = _mm_loadl_epi64((const __m128i *)(src + x + 1)); - const __m128i c = _mm_loadl_epi64((const __m128i *)(src + x + 2)); - const __m128i d = _mm_loadl_epi64((const __m128i *)(src + x + 3)); - const __m128i e = _mm_loadl_epi64((const __m128i *)(src + x + 4)); - const __m128i f = _mm_loadl_epi64((const __m128i *)(src + x + 5)); - const __m128i g = _mm_loadl_epi64((const __m128i *)(src + x + 6)); - const __m128i h = _mm_loadl_epi64((const __m128i *)(src + x + 7)); - const __m128i pack = filter(&a, &b, &c, &d, &e, &f, &g, &h); - _mm_storel_epi64((__m128i *)(dst + x), pack); - } -} - -static void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int dst_w, - int dst_h) { - dst_w /= 2; - dst_h /= 2; - { - DECLARE_ALIGNED(16, uint8_t, tmp[1920 * 8]); - uint8_t *tmp0 = tmp + dst_w * 0; - uint8_t *tmp1 = tmp + dst_w * 1; - uint8_t *tmp2 = tmp + dst_w * 2; - uint8_t *tmp3 = tmp + dst_w * 3; - uint8_t *tmp4 = tmp + dst_w * 4; - uint8_t *tmp5 = tmp + dst_w * 5; - uint8_t *tmp6 = tmp + dst_w * 6; - uint8_t *tmp7 = tmp + dst_w * 7; - uint8_t *tmp8 = NULL; - const int max_width = dst_w & ~7; - int y; - eight_tap_row_ssse3(src - src_stride * 3 - 3, tmp0, dst_w); - eight_tap_row_ssse3(src - src_stride * 2 - 3, tmp1, dst_w); - eight_tap_row_ssse3(src - src_stride * 1 - 3, tmp2, dst_w); - eight_tap_row_ssse3(src + src_stride * 0 - 3, tmp3, dst_w); - eight_tap_row_ssse3(src + src_stride * 1 - 3, tmp4, dst_w); - eight_tap_row_ssse3(src + src_stride * 2 - 3, tmp5, dst_w); - eight_tap_row_ssse3(src + src_stride * 3 - 3, tmp6, dst_w); - for (y = 0; y < dst_h; y++) { - int x; - eight_tap_row_ssse3(src + src_stride * 4 - 3, tmp7, dst_w); - for (x = 0; x < max_width; x += 8) { - const __m128i A = _mm_loadl_epi64((const __m128i *)(src + x)); - const __m128i B = _mm_loadl_epi64((const __m128i *)(tmp3 + x)); - const __m128i AB = 
_mm_unpacklo_epi8(A, B); - __m128i C, D, CD; - _mm_storeu_si128((__m128i *)(dst + x * 2), AB); - { - const __m128i a = - _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 3)); - const __m128i b = - _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 2)); - const __m128i c = - _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 1)); - const __m128i d = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 0)); - const __m128i e = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 1)); - const __m128i f = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 2)); - const __m128i g = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 3)); - const __m128i h = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 4)); - C = filter(&a, &b, &c, &d, &e, &f, &g, &h); - } - { - const __m128i a = _mm_loadl_epi64((const __m128i *)(tmp0 + x)); - const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp1 + x)); - const __m128i c = _mm_loadl_epi64((const __m128i *)(tmp2 + x)); - const __m128i d = _mm_loadl_epi64((const __m128i *)(tmp3 + x)); - const __m128i e = _mm_loadl_epi64((const __m128i *)(tmp4 + x)); - const __m128i f = _mm_loadl_epi64((const __m128i *)(tmp5 + x)); - const __m128i g = _mm_loadl_epi64((const __m128i *)(tmp6 + x)); - const __m128i h = _mm_loadl_epi64((const __m128i *)(tmp7 + x)); - D = filter(&a, &b, &c, &d, &e, &f, &g, &h); - } - CD = _mm_unpacklo_epi8(C, D); - _mm_storeu_si128((__m128i *)(dst + x * 2 + dst_stride), CD); - } - src += src_stride; - dst += dst_stride * 2; - tmp8 = tmp0; - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = tmp5; - tmp5 = tmp6; - tmp6 = tmp7; - tmp7 = tmp8; - } - } + src += src_stride; + dst += 2 * dst_stride; + tmp[8] = tmp[0]; + tmp[0] = tmp[1]; + tmp[1] = tmp[2]; + tmp[2] = tmp[3]; + tmp[3] = tmp[4]; + tmp[4] = tmp[5]; + tmp[5] = tmp[6]; + tmp[6] = tmp[7]; + tmp[7] = tmp[8]; + } while (--y); } void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { + YV12_BUFFER_CONFIG *dst, + uint8_t filter_type, int phase_scaler) { const int src_w = src->y_crop_width; const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; const int dst_uv_w = dst_w / 2; const int dst_uv_h = dst_h / 2; + int scaled = 0; + + // phase_scaler is usually 0 or 8. + assert(phase_scaler >= 0 && phase_scaler < 16); if (dst_w * 2 == src_w && dst_h * 2 == src_h) { - downsample_2_to_1_ssse3(src->y_buffer, src->y_stride, dst->y_buffer, - dst->y_stride, dst_w, dst_h); - downsample_2_to_1_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - downsample_2_to_1_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - vpx_extend_frame_borders(dst); - } else if (dst_w == src_w * 2 && dst_h == src_h * 2) { - // The upsample() supports widths up to 1920 * 2. If greater, fall back - // to vp9_scale_and_extend_frame_c(). 
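For reference, both the removed upsample_1_to_2_ssse3() path and the new scale_plane_1_to_2_phase_0() path implement the same phase-0 1:2 upscale: even output pixels are copied straight from the source, odd output pixels come from an 8-tap filter centred between two source pixels. A minimal scalar sketch of the odd-pixel arithmetic, using the taps and rounding hard-coded in the removed filter() above; the new path does the same computation with the half-pel taps of the selected vp9_filter_kernels[filter_type]. The function name is illustrative only.

static unsigned char scale_1_to_2_odd_pixel_sketch(const unsigned char *p) {
  /* p points at 8 consecutive source pixels; the output sample sits between
   * p[3] and p[4].  The taps sum to 128, hence the +64 bias and >>7, as in
   * the removed filter() (coeffs_ab/coeffs_cd, const64_x16, srai by 7). */
  static const int taps[8] = { -1, 6, -19, 78, 78, -19, 6, -1 };
  int sum = 64, i;
  for (i = 0; i < 8; ++i) sum += taps[i] * p[i];
  sum >>= 7;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (unsigned char)sum;
}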
- if (dst_w / 2 <= 1920) { - upsample_1_to_2_ssse3(src->y_buffer, src->y_stride, dst->y_buffer, - dst->y_stride, dst_w, dst_h); - upsample_1_to_2_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - upsample_1_to_2_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - vpx_extend_frame_borders(dst); + // 2 to 1 + scaled = 1; + + if (phase_scaler == 0) { + scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 + scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0c1); + scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); } else { - vp9_scale_and_extend_frame_c(src, dst); + const int buffer_stride = (dst_w + 3) & ~3; + const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_2_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_2_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_2_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } } + } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { + // 4 to 1 + scaled = 1; + if (phase_scaler == 0) { + scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 + scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0c1); + scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + } else { + const int buffer_stride = (dst_w + 1) & ~1; + const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + // When dst_w is 1 or 2, we need extra padding to avoid heap read overflow + const int extra_padding = 16; + uint8_t *const temp_buffer = + (uint8_t 
*)malloc(buffer_stride * buffer_height + extra_padding); + if (temp_buffer) { + scale_plane_4_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_4_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_4_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) { + // 4 to 3 + const int buffer_stride_hor = (dst_w + 5) - ((dst_w + 5) % 6) + 2; + const int buffer_stride_ver = (dst_w + 7) & ~7; + const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + // When the vertical filter reads more pixels than the horizontal filter + // generated in each row, we need extra padding to avoid heap read overflow. + // For example, the horizontal filter generates 18 pixels but the vertical + // filter reads 24 pixels in a row. The difference is multiplied by 2 since + // two rows are interlaced together in the optimization. + const int extra_padding = (buffer_stride_ver > buffer_stride_hor) + ? 2 * (buffer_stride_ver - buffer_stride_hor) + : 0; + const int buffer_size = buffer_stride_hor * buffer_height + extra_padding; + uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size); + if (temp_buffer) { + scaled = 1; + scale_plane_4_to_3_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], phase_scaler, + temp_buffer); + scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], phase_scaler, + temp_buffer); + free(temp_buffer); + } + } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) { + // 1 to 2 + uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_w + 7) & ~7)); + if (temp_buffer) { + scaled = 1; + scale_plane_1_to_2_phase_0( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src_w, + src_h, vp9_filter_kernels[filter_type][8], temp_buffer); + scale_plane_1_to_2_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, src_w / 2, src_h / 2, + vp9_filter_kernels[filter_type][8], + temp_buffer); + scale_plane_1_to_2_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, src_w / 2, src_h / 2, + vp9_filter_kernels[filter_type][8], + temp_buffer); + free(temp_buffer); + } + } + + if (scaled) { + vpx_extend_frame_borders(dst); } else { - vp9_scale_and_extend_frame_c(src, dst); + // Call c version for all other scaling ratios. + vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); } } diff --git a/libs/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm b/libs/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm deleted file mode 100644 index e476323e14..0000000000 --- a/libs/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm +++ /dev/null @@ -1,261 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. 
-; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%define private_prefix vp9 - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text -ALIGN 16 - -; -; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff, -; intptr_t block_size, int64_t *ssz) -; - -INIT_XMM avx -cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz - vzeroupper - - ; If only one iteration is required, then handle this as a special case. - ; It is the most frequent case, so we can have a significant gain here - ; by not setting up a loop and accumulators. - cmp sizeq, 16 - jne .generic - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Common case of size == 16 - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - ; Load input vectors - mova xm0, [dqcq] - packssdw xm0, [dqcq+16] - mova xm2, [uqcq] - packssdw xm2, [uqcq+16] - - mova xm1, [dqcq+32] - packssdw xm1, [dqcq+48] - mova xm3, [uqcq+32] - packssdw xm3, [uqcq+48] - - ; Compute the errors. - psubw xm0, xm2 - psubw xm1, xm3 - - ; Individual errors are max 15bit+sign, so squares are 30bit, and - ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit). - pmaddwd xm2, xm2 - pmaddwd xm3, xm3 - - pmaddwd xm0, xm0 - pmaddwd xm1, xm1 - - ; Squares are always positive, so we can use unsigned arithmetic after - ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will - ; fit in 32bits - paddd xm2, xm3 - paddd xm0, xm1 - - ; Accumulate horizontally in 64 bits, there is no chance of overflow here - pxor xm5, xm5 - - pblendw xm3, xm5, xm2, 0x33 ; Zero extended low of a pair of 32 bits - psrlq xm2, 32 ; Zero extended high of a pair of 32 bits - - pblendw xm1, xm5, xm0, 0x33 ; Zero extended low of a pair of 32 bits - psrlq xm0, 32 ; Zero extended high of a pair of 32 bits - - paddq xm2, xm3 - paddq xm0, xm1 - - psrldq xm3, xm2, 8 - psrldq xm1, xm0, 8 - - paddq xm2, xm3 - paddq xm0, xm1 - - ; Store the return value -%if ARCH_X86_64 - movq rax, xm0 - movq [sszq], xm2 -%else - movd eax, xm0 - pextrd edx, xm0, 1 - movq [sszd], xm2 -%endif - RET - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Generic case of size != 16, speculative low precision - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ALIGN 16 -.generic: - pxor xm4, xm4 ; sse accumulator - pxor xm5, xm5 ; overflow detection register for xm4 - pxor xm6, xm6 ; ssz accumulator - pxor xm7, xm7 ; overflow detection register for xm6 - lea uqcq, [uqcq+sizeq*4] - lea dqcq, [dqcq+sizeq*4] - neg sizeq - - ; Push the negative size as the high precision code might need it - push sizeq - -.loop: - ; Load input vectors - mova xm0, [dqcq+sizeq*4] - packssdw xm0, [dqcq+sizeq*4+16] - mova xm2, [uqcq+sizeq*4] - packssdw xm2, [uqcq+sizeq*4+16] - - mova xm1, [dqcq+sizeq*4+32] - packssdw xm1, [dqcq+sizeq*4+48] - mova xm3, [uqcq+sizeq*4+32] - packssdw xm3, [uqcq+sizeq*4+48] - - add sizeq, 16 - - ; Compute the squared errors. - ; Individual errors are max 15bit+sign, so squares are 30bit, and - ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit). 
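For reference, the deleted AVX kernel here (and the SSE2 version removed just below) computes a plain sum of squared quantization errors plus a sum of squared coefficients. A scalar sketch of that computation, matching the prototype in the comment above; this is a reference sketch, not the removed code itself.

static int64_t highbd_block_error_8bit_sketch(const int32_t *coeff,
                                              const int32_t *dqcoeff,
                                              intptr_t block_size,
                                              int64_t *ssz) {
  int64_t error = 0, sqcoeff = 0;
  intptr_t i;
  for (i = 0; i < block_size; ++i) {
    const int diff = (int)(coeff[i] - dqcoeff[i]); /* inputs are 15 bit + sign */
    error += (int64_t)diff * diff;                 /* sse accumulator (return value) */
    sqcoeff += (int64_t)coeff[i] * coeff[i];       /* ssz accumulator */
  }
  *ssz = sqcoeff;
  return error;
}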
- psubw xm0, xm2 - pmaddwd xm2, xm2 - pmaddwd xm0, xm0 - - psubw xm1, xm3 - pmaddwd xm3, xm3 - pmaddwd xm1, xm1 - - ; Squares are always positive, so we can use unsigned arithmetic after - ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will - ; fit in 32bits - paddd xm2, xm3 - paddd xm0, xm1 - - ; We accumulate using 32 bit arithmetic, but detect potential overflow - ; by checking if the MSB of the accumulators have ever been a set bit. - ; If yes, we redo the whole compute at the end on higher precision, but - ; this happens extremely rarely, so we still achieve a net gain. - paddd xm4, xm0 - paddd xm6, xm2 - por xm5, xm4 ; OR in the accumulator for overflow detection - por xm7, xm6 ; OR in the accumulator for overflow detection - - jnz .loop - - ; Add pairs horizontally (still only on 32 bits) - phaddd xm4, xm4 - por xm5, xm4 ; OR in the accumulator for overflow detection - phaddd xm6, xm6 - por xm7, xm6 ; OR in the accumulator for overflow detection - - ; Check for possibility of overflow by testing if bit 32 of each dword lane - ; have ever been set. If they were not, then there was no overflow and the - ; final sum will fit in 32 bits. If overflow happened, then - ; we redo the whole computation on higher precision. - por xm7, xm5 - pmovmskb r4, xm7 - test r4, 0x8888 - jnz .highprec - - phaddd xm4, xm4 - phaddd xm6, xm6 - pmovzxdq xm4, xm4 - pmovzxdq xm6, xm6 - - ; Restore stack - pop sizeq - - ; Store the return value -%if ARCH_X86_64 - movq rax, xm4 - movq [sszq], xm6 -%else - movd eax, xm4 - pextrd edx, xm4, 1 - movq [sszd], xm6 -%endif - RET - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Generic case of size != 16, high precision case - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -.highprec: - pxor xm4, xm4 ; sse accumulator - pxor xm5, xm5 ; dedicated zero register - pxor xm6, xm6 ; ssz accumulator - pop sizeq - -.loophp: - mova xm0, [dqcq+sizeq*4] - packssdw xm0, [dqcq+sizeq*4+16] - mova xm2, [uqcq+sizeq*4] - packssdw xm2, [uqcq+sizeq*4+16] - - mova xm1, [dqcq+sizeq*4+32] - packssdw xm1, [dqcq+sizeq*4+48] - mova xm3, [uqcq+sizeq*4+32] - packssdw xm3, [uqcq+sizeq*4+48] - - add sizeq, 16 - - ; individual errors are max. 15bit+sign, so squares are 30bit, and - ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) - - psubw xm0, xm2 - pmaddwd xm2, xm2 - pmaddwd xm0, xm0 - - psubw xm1, xm3 - pmaddwd xm3, xm3 - pmaddwd xm1, xm1 - - ; accumulate in 64bit - punpckldq xm7, xm0, xm5 - punpckhdq xm0, xm5 - paddq xm4, xm7 - - punpckldq xm7, xm2, xm5 - punpckhdq xm2, xm5 - paddq xm6, xm7 - - punpckldq xm7, xm1, xm5 - punpckhdq xm1, xm5 - paddq xm4, xm7 - - punpckldq xm7, xm3, xm5 - punpckhdq xm3, xm5 - paddq xm6, xm7 - - paddq xm4, xm0 - paddq xm4, xm1 - paddq xm6, xm2 - paddq xm6, xm3 - - jnz .loophp - - ; Accumulate horizontally - movhlps xm5, xm4 - movhlps xm7, xm6 - paddq xm4, xm5 - paddq xm6, xm7 - - ; Store the return value -%if ARCH_X86_64 - movq rax, xm4 - movq [sszq], xm6 -%else - movd eax, xm4 - pextrd edx, xm4, 1 - movq [sszd], xm6 -%endif - RET - -END diff --git a/libs/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm b/libs/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm deleted file mode 100644 index f3b8f01947..0000000000 --- a/libs/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm +++ /dev/null @@ -1,98 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
-; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%define private_prefix vp9 - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text -ALIGN 16 - -; -; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff, -; intptr_t block_size, int64_t *ssz) -; - -INIT_XMM sse2 -cglobal highbd_block_error_8bit, 3, 3, 8, uqc, dqc, size, ssz - pxor m4, m4 ; sse accumulator - pxor m6, m6 ; ssz accumulator - pxor m5, m5 ; dedicated zero register - lea uqcq, [uqcq+sizeq*4] - lea dqcq, [dqcq+sizeq*4] - neg sizeq - - ALIGN 16 - -.loop: - mova m0, [dqcq+sizeq*4] - packssdw m0, [dqcq+sizeq*4+mmsize] - mova m2, [uqcq+sizeq*4] - packssdw m2, [uqcq+sizeq*4+mmsize] - - mova m1, [dqcq+sizeq*4+mmsize*2] - packssdw m1, [dqcq+sizeq*4+mmsize*3] - mova m3, [uqcq+sizeq*4+mmsize*2] - packssdw m3, [uqcq+sizeq*4+mmsize*3] - - add sizeq, mmsize - - ; individual errors are max. 15bit+sign, so squares are 30bit, and - ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) - - psubw m0, m2 - pmaddwd m2, m2 - pmaddwd m0, m0 - - psubw m1, m3 - pmaddwd m3, m3 - pmaddwd m1, m1 - - ; accumulate in 64bit - punpckldq m7, m0, m5 - punpckhdq m0, m5 - paddq m4, m7 - - punpckldq m7, m2, m5 - punpckhdq m2, m5 - paddq m6, m7 - - punpckldq m7, m1, m5 - punpckhdq m1, m5 - paddq m4, m7 - - punpckldq m7, m3, m5 - punpckhdq m3, m5 - paddq m6, m7 - - paddq m4, m0 - paddq m4, m1 - paddq m6, m2 - paddq m6, m3 - - jnz .loop - - ; accumulate horizontally and store in return value - movhlps m5, m4 - movhlps m7, m6 - paddq m4, m5 - paddq m6, m7 - -%if ARCH_X86_64 - movq rax, m4 - movq [sszq], m6 -%else - mov eax, sszm - pshufd m5, m4, 0x1 - movq [eax], m6 - movd eax, m4 - movd edx, m5 -%endif - RET diff --git a/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c index 3f8ee5f244..ca0ad4407e 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c @@ -8,25 +8,30 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include #include #include #include "./vp9_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" -void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { __m128i zero; __m128i thr; int16_t nzflag; + __m128i eob; + __m128i round, quant, dequant; + (void)scan_ptr; - (void)zbin_ptr; - (void)quant_shift_ptr; + (void)skip_block; + assert(!skip_block); coeff_ptr += n_coeffs; iscan_ptr += n_coeffs; @@ -35,40 +40,106 @@ void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, n_coeffs = -n_coeffs; zero = _mm_setzero_si128(); - if (!skip_block) { - __m128i eob; - __m128i round, quant, dequant; + { + __m128i coeff0, coeff1; + + // Setup global values { - __m128i coeff0, coeff1; + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + } - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + // Do DC and first 15 AC + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs)); - coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1); + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + round = _mm_unpackhi_epi64(round, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); + quant = _mm_unpackhi_epi64(quant, quant); + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); + + coeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, 
dqcoeff_ptr + n_coeffs + 8); + } + + { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob = _mm_max_epi16(eob, eob1); + } + n_coeffs += 8 * 2; + } + + thr = _mm_srai_epi16(dequant, 1); + + // AC only loop + while (n_coeffs < 0) { + __m128i coeff0, coeff1; + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); qcoeff1 = _mm_adds_epi16(qcoeff1, round); qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); // Reinsert signs @@ -77,134 +148,55 @@ void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); - } + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); + } else { + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + 
store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); } - n_coeffs += 8 * 2; } - thr = _mm_srai_epi16(dequant, 1); - - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs)); - coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | - _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); - - if (nzflag) { - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); - } else { - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); - - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); - } - } - - if (nzflag) { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; + if (nzflag) { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob0, eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob0 = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob0 = _mm_max_epi16(eob0, eob1); + eob = _mm_max_epi16(eob, eob0); } + n_coeffs += 8 * 2; + } - // 
Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; + // Accumulate EOB + { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + *eob_ptr = _mm_extract_epi16(eob, 1); } } diff --git a/libs/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/libs/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index ec61c0c3a7..5703aa3bb6 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ b/libs/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -11,6 +11,7 @@ %define private_prefix vp9 %include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" SECTION_RODATA pw_1: times 8 dw 1 @@ -18,17 +19,14 @@ pw_1: times 8 dw 1 SECTION .text %macro QUANTIZE_FP 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, \ +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, \ + qcoeff, dqcoeff, dequant, \ eob, scan, iscan - cmp dword skipm, 0 - jne .blank ; actual quantize loop - setup pointers, rounders, etc. 
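Both the vp9_quantize_fp_sse2() rewrite above and this assembly macro vectorize the same "fast path" quantizer. A scalar sketch of that arithmetic follows (names are illustrative; vp9_quantize_fp_c() is the in-tree scalar reference), with coefficients indexed in their natural order and iscan giving each position's rank in scan order.

static void quantize_fp_sketch(const int16_t *coeff, int n_coeffs,
                               const int16_t *round, const int16_t *quant,
                               int16_t *qcoeff, int16_t *dqcoeff,
                               const int16_t *dequant, const int16_t *iscan,
                               uint16_t *eob_ptr) {
  int i, eob = -1;
  for (i = 0; i < n_coeffs; ++i) {
    /* DC (position 0) uses entry 0 of round/quant/dequant, every AC position
     * uses entry 1. */
    const int k = (i == 0) ? 0 : 1;
    /* abs() via the "poor man's sign extract" used above:
     * s = x >> 15;  abs = (x ^ s) - s. */
    const int abs_coeff = coeff[i] < 0 ? -coeff[i] : coeff[i];
    int tmp = abs_coeff + round[k];
    if (tmp > 32767) tmp = 32767;           /* the vector code uses a saturating add */
    tmp = (tmp * quant[k]) >> 16;
    qcoeff[i] = (int16_t)(coeff[i] < 0 ? -tmp : tmp);
    dqcoeff[i] = (int16_t)(qcoeff[i] * dequant[k]);
    if (tmp && iscan[i] > eob) eob = iscan[i];  /* last nonzero, in scan order */
  }
  *eob_ptr = (uint16_t)(eob + 1);
}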
movifnidn coeffq, coeffmp movifnidn ncoeffq, ncoeffmp mov r2, dequantmp - movifnidn zbinq, zbinmp movifnidn roundq, roundmp movifnidn quantq, quantmp mova m1, [roundq] ; m1 = round @@ -48,15 +46,15 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %endif pxor m5, m5 ; m5 = dedicated zero - lea coeffq, [ coeffq+ncoeffq*2] - lea r5q, [ r5q+ncoeffq*2] - lea r3q, [ r3q+ncoeffq*2] - lea r4q, [r4q+ncoeffq*2] + INCREMENT_ELEMENTS_TRAN_LOW coeffq, ncoeffq + lea r5q, [r5q+ncoeffq*2] + INCREMENT_ELEMENTS_TRAN_LOW r3q, ncoeffq + INCREMENT_ELEMENTS_TRAN_LOW r4q, ncoeffq neg ncoeffq ; get DC and first 15 AC coeffs - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i] + LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) pcmpeqw m7, m7 @@ -69,8 +67,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmulhw m13, m11, m2 ; m13 = m11*q>>16 psignw m8, m9 ; m8 = reinsert sign psignw m13, m10 ; m13 = reinsert sign - mova [r3q+ncoeffq*2+ 0], m8 - mova [r3q+ncoeffq*2+16], m13 + STORE_TRAN_LOW 8, r3q, ncoeffq, 6, 11, 12 + STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12 %ifidn %1, fp_32x32 pabsw m8, m8 pabsw m13, m13 @@ -87,8 +85,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %else psrlw m0, m3, 1 %endif - mova [r4q+ncoeffq*2+ 0], m8 - mova [r4q+ncoeffq*2+16], m13 + STORE_TRAN_LOW 8, r4q, ncoeffq, 6, 11, 12 + STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12 pcmpeqw m8, m5 ; m8 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] @@ -102,8 +100,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ jz .accumulate_eob .ac_only_loop: - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i] + LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) @@ -123,8 +121,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmulhw m13, m11, m2 ; m13 = m11*q>>16 psignw m14, m9 ; m14 = reinsert sign psignw m13, m10 ; m13 = reinsert sign - mova [r3q+ncoeffq*2+ 0], m14 - mova [r3q+ncoeffq*2+16], m13 + STORE_TRAN_LOW 14, r3q, ncoeffq, 6, 11, 12 + STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12 %ifidn %1, fp_32x32 pabsw m14, m14 pabsw m13, m13 @@ -137,8 +135,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m14, m9 psignw m13, m10 %endif - mova [r4q+ncoeffq*2+ 0], m14 - mova [r4q+ncoeffq*2+16], m13 + STORE_TRAN_LOW 14, r4q, ncoeffq, 6, 11, 12 + STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12 pcmpeqw m14, m5 ; m14 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] @@ -154,10 +152,10 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ jmp .accumulate_eob .skip_iter: - mova [r3q+ncoeffq*2+ 0], m5 - mova [r3q+ncoeffq*2+16], m5 - mova [r4q+ncoeffq*2+ 0], m5 - mova [r4q+ncoeffq*2+16], m5 + STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + 8 + STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + 8 add ncoeffq, mmsize jl .ac_only_loop @@ -171,28 +169,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pshuflw m7, m8, 0x1 pmaxsw m8, m7 pextrw r6, m8, 0 - mov [r2], r6 - RET - - ; skip-block, i.e. 
just write all zeroes -.blank: - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - - lea r0q, [r0q+ncoeffq*2] - lea r2q, [r2q+ncoeffq*2] - neg ncoeffq - pxor m7, m7 -.blank_loop: - mova [r0q+ncoeffq*2+ 0], m7 - mova [r0q+ncoeffq*2+16], m7 - mova [r2q+ncoeffq*2+ 0], m7 - mova [r2q+ncoeffq*2+16], m7 - add ncoeffq, mmsize - jl .blank_loop - mov word [r3q], 0 + mov [r2], r6w RET %endmacro diff --git a/libs/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/libs/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm deleted file mode 100644 index 21aaa93831..0000000000 --- a/libs/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm +++ /dev/null @@ -1,212 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -; void vp9_temporal_filter_apply_sse2 | arg -; (unsigned char *frame1, | 0 -; unsigned int stride, | 1 -; unsigned char *frame2, | 2 -; unsigned int block_width, | 3 -; unsigned int block_height, | 4 -; int strength, | 5 -; int filter_weight, | 6 -; unsigned int *accumulator, | 7 -; unsigned short *count) | 8 -global sym(vp9_temporal_filter_apply_sse2) PRIVATE -sym(vp9_temporal_filter_apply_sse2): - - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ALIGN_STACK 16, rax - %define block_width 0 - %define block_height 16 - %define strength 32 - %define filter_weight 48 - %define rounding_bit 64 - %define rbp_backup 80 - %define stack_size 96 - sub rsp, stack_size - mov [rsp + rbp_backup], rbp - ; end prolog - - mov edx, arg(3) - mov [rsp + block_width], rdx - mov edx, arg(4) - mov [rsp + block_height], rdx - movd xmm6, arg(5) - movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read - - ; calculate the rounding bit outside the loop - ; 0x8000 >> (16 - strength) - mov rdx, 16 - sub rdx, arg(5) ; 16 - strength - movq xmm4, rdx ; can't use rdx w/ shift - movdqa xmm5, [GLOBAL(_const_top_bit)] - psrlw xmm5, xmm4 - movdqa [rsp + rounding_bit], xmm5 - - mov rsi, arg(0) ; src/frame1 - mov rdx, arg(2) ; predictor frame - mov rdi, arg(7) ; accumulator - mov rax, arg(8) ; count - - ; dup the filter weight and store for later - movd xmm0, arg(6) ; filter_weight - pshuflw xmm0, xmm0, 0 - punpcklwd xmm0, xmm0 - movdqa [rsp + filter_weight], xmm0 - - mov rbp, arg(1) ; stride - pxor xmm7, xmm7 ; zero for extraction - - mov rcx, [rsp + block_width] - imul rcx, [rsp + block_height] - add rcx, rdx - cmp dword ptr [rsp + block_width], 8 - jne .temporal_filter_apply_load_16 - -.temporal_filter_apply_load_8: - movq xmm0, [rsi] ; first row - lea rsi, [rsi + rbp] ; += stride - punpcklbw xmm0, xmm7 ; src[ 0- 7] - movq xmm1, [rsi] ; second row - lea rsi, [rsi + rbp] ; += stride - punpcklbw xmm1, xmm7 ; src[ 8-15] - jmp .temporal_filter_apply_load_finished - -.temporal_filter_apply_load_16: - movdqa xmm0, [rsi] ; src (frame1) - lea rsi, [rsi + rbp] ; += stride - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 ; src[ 0- 7] - punpckhbw xmm1, xmm7 ; src[ 8-15] - -.temporal_filter_apply_load_finished: - movdqa xmm2, [rdx] ; predictor (frame2) - movdqa xmm3, xmm2 - punpcklbw xmm2, xmm7 ; pred[ 0- 7] - punpckhbw 
xmm3, xmm7 ; pred[ 8-15] - - ; modifier = src_byte - pixel_value - psubw xmm0, xmm2 ; src - pred[ 0- 7] - psubw xmm1, xmm3 ; src - pred[ 8-15] - - ; modifier *= modifier - pmullw xmm0, xmm0 ; modifer[ 0- 7]^2 - pmullw xmm1, xmm1 ; modifer[ 8-15]^2 - - ; modifier *= 3 - pmullw xmm0, [GLOBAL(_const_3w)] - pmullw xmm1, [GLOBAL(_const_3w)] - - ; modifer += 0x8000 >> (16 - strength) - paddw xmm0, [rsp + rounding_bit] - paddw xmm1, [rsp + rounding_bit] - - ; modifier >>= strength - psrlw xmm0, [rsp + strength] - psrlw xmm1, [rsp + strength] - - ; modifier = 16 - modifier - ; saturation takes care of modifier > 16 - movdqa xmm3, [GLOBAL(_const_16w)] - movdqa xmm2, [GLOBAL(_const_16w)] - psubusw xmm3, xmm1 - psubusw xmm2, xmm0 - - ; modifier *= filter_weight - pmullw xmm2, [rsp + filter_weight] - pmullw xmm3, [rsp + filter_weight] - - ; count - movdqa xmm4, [rax] - movdqa xmm5, [rax+16] - ; += modifier - paddw xmm4, xmm2 - paddw xmm5, xmm3 - ; write back - movdqa [rax], xmm4 - movdqa [rax+16], xmm5 - lea rax, [rax + 16*2] ; count += 16*(sizeof(short)) - - ; load and extract the predictor up to shorts - pxor xmm7, xmm7 - movdqa xmm0, [rdx] - lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char)) - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 ; pred[ 0- 7] - punpckhbw xmm1, xmm7 ; pred[ 8-15] - - ; modifier *= pixel_value - pmullw xmm0, xmm2 - pmullw xmm1, xmm3 - - ; expand to double words - movdqa xmm2, xmm0 - punpcklwd xmm0, xmm7 ; [ 0- 3] - punpckhwd xmm2, xmm7 ; [ 4- 7] - movdqa xmm3, xmm1 - punpcklwd xmm1, xmm7 ; [ 8-11] - punpckhwd xmm3, xmm7 ; [12-15] - - ; accumulator - movdqa xmm4, [rdi] - movdqa xmm5, [rdi+16] - movdqa xmm6, [rdi+32] - movdqa xmm7, [rdi+48] - ; += modifier - paddd xmm4, xmm0 - paddd xmm5, xmm2 - paddd xmm6, xmm1 - paddd xmm7, xmm3 - ; write back - movdqa [rdi], xmm4 - movdqa [rdi+16], xmm5 - movdqa [rdi+32], xmm6 - movdqa [rdi+48], xmm7 - lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int)) - - cmp rdx, rcx - je .temporal_filter_apply_epilog - pxor xmm7, xmm7 ; zero for extraction - cmp dword ptr [rsp + block_width], 16 - je .temporal_filter_apply_load_16 - jmp .temporal_filter_apply_load_8 - -.temporal_filter_apply_epilog: - ; begin epilog - mov rbp, [rsp + rbp_backup] - add rsp, stack_size - pop rsp - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -_const_3w: - times 8 dw 3 -align 16 -_const_top_bit: - times 8 dw 1<<15 -align 16 -_const_16w - times 8 dw 16 diff --git a/libs/libvpx/vp9/vp9_cx_iface.c b/libs/libvpx/vp9/vp9_cx_iface.c index a797b2c262..881caae78b 100644 --- a/libs/libvpx/vp9/vp9_cx_iface.c +++ b/libs/libvpx/vp9/vp9_cx_iface.c @@ -51,6 +51,8 @@ struct vp9_extracfg { vpx_color_range_t color_range; int render_width; int render_height; + unsigned int row_mt; + unsigned int motion_vector_unit_test; }; static struct vp9_extracfg default_extra_cfg = { @@ -82,6 +84,8 @@ static struct vp9_extracfg default_extra_cfg = { 0, // color range 0, // render width 0, // render height + 0, // row_mt + 0, // motion_vector_unit_test }; struct vpx_codec_alg_priv { @@ -157,6 +161,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); RANGE_CHECK_BOOL(extra_cfg, lossless); + RANGE_CHECK_BOOL(extra_cfg, frame_parallel_decoding_mode); RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 2); RANGE_CHECK(extra_cfg, alt_ref_aq, 0, 1); RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1); @@ -166,12 +171,17 @@ static 
vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100); RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100); RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); + RANGE_CHECK(cfg, rc_2pass_vbr_corpus_complexity, 0, 10000); RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO); RANGE_CHECK_BOOL(cfg, rc_resize_allowed); RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100); RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100); RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100); +#if CONFIG_REALTIME_ONLY + RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_ONE_PASS); +#else RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS); +#endif RANGE_CHECK(extra_cfg, min_gf_interval, 0, (MAX_LAG_BUFFERS - 1)); RANGE_CHECK(extra_cfg, max_gf_interval, 0, (MAX_LAG_BUFFERS - 1)); if (extra_cfg->max_gf_interval > 0) { @@ -182,6 +192,13 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, (MAX_LAG_BUFFERS - 1)); } + // For formation of valid ARF groups lag_in _frames should be 0 or greater + // than the max_gf_interval + 2 + if (cfg->g_lag_in_frames > 0 && extra_cfg->max_gf_interval > 0 && + cfg->g_lag_in_frames < extra_cfg->max_gf_interval + 2) { + ERROR("Set lag in frames to 0 (low delay) or >= (max-gf-interval + 2)"); + } + if (cfg->rc_resize_allowed == 1) { RANGE_CHECK(cfg, rc_scaled_width, 0, cfg->g_w); RANGE_CHECK(cfg, rc_scaled_height, 0, cfg->g_h); @@ -197,7 +214,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, level != LEVEL_4 && level != LEVEL_4_1 && level != LEVEL_5 && level != LEVEL_5_1 && level != LEVEL_5_2 && level != LEVEL_6 && level != LEVEL_6_1 && level != LEVEL_6_2 && level != LEVEL_UNKNOWN && - level != LEVEL_MAX) + level != LEVEL_AUTO && level != LEVEL_MAX) ERROR("target_level is invalid"); } @@ -244,6 +261,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, "kf_min_dist not supported in auto mode, use 0 " "or kf_max_dist instead."); + RANGE_CHECK(extra_cfg, row_mt, 0, 1); + RANGE_CHECK(extra_cfg, motion_vector_unit_test, 0, 2); RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2); RANGE_CHECK(extra_cfg, cpu_used, -8, 8); RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6); @@ -262,6 +281,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, if (extra_cfg->tuning == VP8_TUNE_SSIM) ERROR("Option --tune=ssim is not currently supported in VP9."); +#if !CONFIG_REALTIME_ONLY if (cfg->g_pass == VPX_RC_LAST_PASS) { const size_t packet_sz = sizeof(FIRSTPASS_STATS); const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); @@ -313,6 +333,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, ERROR("rc_twopass_stats_in missing EOS stats packet"); } } +#endif // !CONFIG_REALTIME_ONLY #if !CONFIG_VP9_HIGHBITDEPTH if (cfg->g_profile > (unsigned int)PROFILE_1) { @@ -389,6 +410,60 @@ static int get_image_bps(const vpx_image_t *img) { return 0; } +// Modify the encoder config for the target level. +static void config_target_level(VP9EncoderConfig *oxcf) { + double max_average_bitrate; // in bits per second + int max_over_shoot_pct; + const int target_level_index = get_level_index(oxcf->target_level); + + vpx_clear_system_state(); + assert(target_level_index >= 0); + assert(target_level_index < VP9_LEVELS); + + // Maximum target bit-rate is level_limit * 80%. 
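A worked example of the clamp that follows, under the assumption that the average_bitrate field of vp9_level_defs[] is in kilobits per second while oxcf->target_bandwidth is in bits per second, so the factor of 800 is 1000 (unit conversion) times 0.8 (the 80% cap); the numbers are illustrative only.

/* Level limit of 1800 kbps: 80% of 1.8 Mb/s is 1.44 Mb/s. */
double max_average_bitrate = 1800 * 800.0;          /* 1440000.0 b/s */
int64_t target_bandwidth = 2000000;                 /* caller asked for 2 Mb/s */
if ((double)target_bandwidth > max_average_bitrate)
  target_bandwidth = (int64_t)max_average_bitrate;  /* clamped to 1440000 */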
+ max_average_bitrate = + vp9_level_defs[target_level_index].average_bitrate * 800.0; + if ((double)oxcf->target_bandwidth > max_average_bitrate) + oxcf->target_bandwidth = (int64_t)(max_average_bitrate); + if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) + oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth; + + // Adjust max over-shoot percentage. + max_over_shoot_pct = + (int)((max_average_bitrate * 1.10 - (double)oxcf->target_bandwidth) * + 100 / (double)(oxcf->target_bandwidth)); + if (oxcf->over_shoot_pct > max_over_shoot_pct) + oxcf->over_shoot_pct = max_over_shoot_pct; + + // Adjust worst allowed quantizer. + oxcf->worst_allowed_q = vp9_quantizer_to_qindex(63); + + // Adjust minimum art-ref distance. + // min_gf_interval should be no less than min_altref_distance + 1, + // as the encoder may produce bitstream with alt-ref distance being + // min_gf_interval - 1. + if (oxcf->min_gf_interval <= + (int)vp9_level_defs[target_level_index].min_altref_distance) { + oxcf->min_gf_interval = + (int)vp9_level_defs[target_level_index].min_altref_distance + 1; + // If oxcf->max_gf_interval == 0, it will be assigned with a default value + // in vp9_rc_set_gf_interval_range(). + if (oxcf->max_gf_interval != 0) { + oxcf->max_gf_interval = + VPXMAX(oxcf->max_gf_interval, oxcf->min_gf_interval); + } + } + + // Adjust maximum column tiles. + if (vp9_level_defs[target_level_index].max_col_tiles < + (1 << oxcf->tile_columns)) { + while (oxcf->tile_columns > 0 && + vp9_level_defs[target_level_index].max_col_tiles < + (1 << oxcf->tile_columns)) + --oxcf->tile_columns; + } +} + static vpx_codec_err_t set_encoder_config( VP9EncoderConfig *oxcf, const vpx_codec_enc_cfg_t *cfg, const struct vp9_extracfg *extra_cfg) { @@ -452,6 +527,7 @@ static vpx_codec_err_t set_encoder_config( oxcf->two_pass_vbrbias = cfg->rc_2pass_vbr_bias_pct; oxcf->two_pass_vbrmin_section = cfg->rc_2pass_vbr_minsection_pct; oxcf->two_pass_vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct; + oxcf->vbr_corpus_complexity = cfg->rc_2pass_vbr_corpus_complexity; oxcf->auto_key = cfg->kf_mode == VPX_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist; @@ -509,6 +585,9 @@ static vpx_codec_err_t set_encoder_config( oxcf->target_level = extra_cfg->target_level; + oxcf->row_mt = extra_cfg->row_mt; + oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test; + for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { #if CONFIG_SPATIAL_SVC oxcf->ss_enable_auto_arf[sl] = cfg->ss_enable_auto_alt_ref[sl]; @@ -532,6 +611,8 @@ static vpx_codec_err_t set_encoder_config( } else if (oxcf->ts_number_layers == 1) { oxcf->ts_rate_decimator[0] = 1; } + + if (get_level_index(oxcf->target_level) >= 0) config_target_level(oxcf); /* printf("Current VP9 Settings: \n"); printf("target_bandwidth: %d\n", oxcf->target_bandwidth); @@ -557,6 +638,7 @@ static vpx_codec_err_t set_encoder_config( printf("two_pass_vbrbias: %d\n", oxcf->two_pass_vbrbias); printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section); printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section); + printf("vbr_corpus_complexity: %d\n", oxcf->vbr_corpus_complexity); printf("lag_in_frames: %d\n", oxcf->lag_in_frames); printf("enable_auto_arf: %d\n", oxcf->enable_auto_arf); printf("Version: %d\n", oxcf->Version); @@ -795,6 +877,21 @@ static vpx_codec_err_t ctrl_set_target_level(vpx_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_row_mt(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = 
ctx->extra_cfg; + extra_cfg.row_mt = CAST(VP9E_SET_ROW_MT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_enable_motion_vector_unit_test( + vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.motion_vector_unit_test = + CAST(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static vpx_codec_err_t ctrl_get_level(vpx_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return VPX_CODEC_INVALID_PARAM; @@ -817,12 +914,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, priv->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool)); if (priv->buffer_pool == NULL) return VPX_CODEC_MEM_ERROR; -#if CONFIG_MULTITHREAD - if (pthread_mutex_init(&priv->buffer_pool->pool_mutex, NULL)) { - return VPX_CODEC_MEM_ERROR; - } -#endif - if (ctx->config.enc) { // Update the reference to the config structure to an internal copy. priv->cfg = *ctx->config.enc; @@ -854,9 +945,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) { free(ctx->cx_data); vp9_remove_compressor(ctx->cpi); -#if CONFIG_MULTITHREAD - pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex); -#endif vpx_free(ctx->buffer_pool); vpx_free(ctx); return VPX_CODEC_OK; @@ -867,6 +955,10 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, unsigned long deadline) { MODE new_mode = BEST; +#if CONFIG_REALTIME_ONLY + (void)duration; + deadline = VPX_DL_REALTIME; +#else switch (ctx->cfg.g_pass) { case VPX_RC_ONE_PASS: if (deadline > 0) { @@ -887,6 +979,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, case VPX_RC_FIRST_PASS: break; case VPX_RC_LAST_PASS: new_mode = deadline > 0 ? 
GOOD : BEST; break; } +#endif // CONFIG_REALTIME_ONLY if (deadline == VPX_DL_REALTIME) { ctx->oxcf.pass = 0; @@ -1002,6 +1095,28 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (cpi == NULL) return VPX_CODEC_INVALID_PARAM; + if (cpi->oxcf.pass == 2 && cpi->level_constraint.level_index >= 0 && + !cpi->level_constraint.rc_config_updated) { + SVC *const svc = &cpi->svc; + const int is_two_pass_svc = + (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1); + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + FIRSTPASS_STATS *stats = &twopass->total_stats; + if (is_two_pass_svc) { + const double frame_rate = 10000000.0 * stats->count / stats->duration; + vp9_update_spatial_layer_framerate(cpi, frame_rate); + twopass->bits_left = + (int64_t)(stats->duration * + svc->layer_context[svc->spatial_layer_id].target_bandwidth / + 10000000.0); + } else { + twopass->bits_left = + (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); + } + cpi->level_constraint.rc_config_updated = 1; + } + if (img != NULL) { res = validate_img(ctx, img); if (res == VPX_CODEC_OK) { @@ -1039,7 +1154,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, } cpi->common.error.setjmp = 1; - vp9_apply_encoding_flags(cpi, flags); + if (res == VPX_CODEC_OK) vp9_apply_encoding_flags(cpi, flags); // Handle fixed keyframe intervals if (ctx->cfg.kf_mode == VPX_KF_AUTO && @@ -1173,8 +1288,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, cx_data += size; cx_data_sz -= size; -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) -#if CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) if (cpi->use_svc && !ctx->output_cx_pkt_cb.output_cx_pkt) { vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr; int sl; @@ -1194,7 +1308,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr); } -#endif #endif if (is_one_pass_cbr_svc(cpi) && (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { @@ -1367,6 +1480,9 @@ static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) { cfg->ss_number_layers > 1 && cfg->ts_number_layers > 1) { return VPX_CODEC_INVALID_PARAM; } + + vp9_set_row_mt(ctx->cpi); + return VPX_CODEC_OK; } @@ -1525,6 +1641,8 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config }, { VP9E_SET_RENDER_SIZE, ctrl_set_render_size }, { VP9E_SET_TARGET_LEVEL, ctrl_set_target_level }, + { VP9E_SET_ROW_MT, ctrl_set_row_mt }, + { VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test }, // Getters { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer }, @@ -1581,6 +1699,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { 50, // rc_two_pass_vbrbias 0, // rc_two_pass_vbrmin_section 2000, // rc_two_pass_vbrmax_section + 0, // rc_2pass_vbr_corpus_complexity (non 0 for corpus vbr) // keyframing settings (kf) VPX_KF_AUTO, // g_kfmode diff --git a/libs/libvpx/vp9/vp9_dx_iface.c b/libs/libvpx/vp9/vp9_dx_iface.c index 3b5dc3ddac..657490f4bd 100644 --- a/libs/libvpx/vp9/vp9_dx_iface.c +++ b/libs/libvpx/vp9/vp9_dx_iface.c @@ -47,12 +47,6 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, ctx->priv->init_flags = ctx->init_flags; priv->si.sz = sizeof(priv->si); priv->flushed = 0; - // Only do frame parallel decode when threads > 1. 
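With the frame-parallel worker pool removed here, the only decode-side threading knob left is the thread count handed to the single decoder instance (pbi->max_threads below). A hypothetical caller-side sketch, not part of this patch:

#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"

static vpx_codec_err_t open_vp9_decoder_sketch(vpx_codec_ctx_t *decoder) {
  /* Request 4 tile/loopfilter worker threads through the ordinary decoder
   * configuration; VPX_CODEC_USE_FRAME_THREADING no longer selects a
   * separate frame-parallel path. */
  vpx_codec_dec_cfg_t dec_cfg = { 4, 0, 0 }; /* threads, w, h */
  return vpx_codec_dec_init(decoder, vpx_codec_vp9_dx(), &dec_cfg, 0);
}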
- priv->frame_parallel_decode = - (ctx->config.dec && (ctx->config.dec->threads > 1) && - (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING)) - ? 1 - : 0; if (ctx->config.dec) { priv->cfg = *ctx->config.dec; ctx->config.dec = &priv->cfg; @@ -63,33 +57,8 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, } static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) { - if (ctx->frame_workers != NULL) { - int i; - // Shutdown all threads before reclaiming any memory. The frame-level - // parallel decoder may access data from another worker. - for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - vpx_get_worker_interface()->end(worker); - } - for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - vp9_remove_common(&frame_worker_data->pbi->common); -#if CONFIG_VP9_POSTPROC - vp9_free_postproc_buffers(&frame_worker_data->pbi->common); -#endif - vp9_decoder_remove(frame_worker_data->pbi); - vpx_free(frame_worker_data->scratch_buffer); -#if CONFIG_MULTITHREAD - pthread_mutex_destroy(&frame_worker_data->stats_mutex); - pthread_cond_destroy(&frame_worker_data->stats_cond); -#endif - vpx_free(frame_worker_data); - } -#if CONFIG_MULTITHREAD - pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex); -#endif + if (ctx->pbi != NULL) { + vp9_decoder_remove(ctx->pbi); } if (ctx->buffer_pool) { @@ -97,7 +66,6 @@ static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) { vp9_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers); } - vpx_free(ctx->frame_workers); vpx_free(ctx->buffer_pool); vpx_free(ctx); return VPX_CODEC_OK; @@ -231,32 +199,26 @@ static vpx_codec_err_t update_error_state( } static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { - int i; + VP9_COMMON *const cm = &ctx->pbi->common; + BufferPool *const pool = cm->buffer_pool; - for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - VP9_COMMON *const cm = &frame_worker_data->pbi->common; - BufferPool *const pool = cm->buffer_pool; + cm->new_fb_idx = INVALID_IDX; + cm->byte_alignment = ctx->byte_alignment; + cm->skip_loop_filter = ctx->skip_loop_filter; - cm->new_fb_idx = INVALID_IDX; - cm->byte_alignment = ctx->byte_alignment; - cm->skip_loop_filter = ctx->skip_loop_filter; + if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { + pool->get_fb_cb = ctx->get_ext_fb_cb; + pool->release_fb_cb = ctx->release_ext_fb_cb; + pool->cb_priv = ctx->ext_priv; + } else { + pool->get_fb_cb = vp9_get_frame_buffer; + pool->release_fb_cb = vp9_release_frame_buffer; - if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { - pool->get_fb_cb = ctx->get_ext_fb_cb; - pool->release_fb_cb = ctx->release_ext_fb_cb; - pool->cb_priv = ctx->ext_priv; - } else { - pool->get_fb_cb = vp9_get_frame_buffer; - pool->release_fb_cb = vp9_release_frame_buffer; + if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to initialize internal frame buffers"); - if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to initialize internal frame buffers"); - - pool->cb_priv = &pool->int_frame_buffers; - } + pool->cb_priv = &pool->int_frame_buffers; } } @@ -273,124 +235,21 @@ static 
void set_ppflags(const vpx_codec_alg_priv_t *ctx, vp9_ppflags_t *flags) { flags->noise_level = ctx->postproc_cfg.noise_level; } -static int frame_worker_hook(void *arg1, void *arg2) { - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1; - const uint8_t *data = frame_worker_data->data; - (void)arg2; - - frame_worker_data->result = vp9_receive_compressed_data( - frame_worker_data->pbi, frame_worker_data->data_size, &data); - frame_worker_data->data_end = data; - - if (frame_worker_data->pbi->frame_parallel_decode) { - // In frame parallel decoding, a worker thread must successfully decode all - // the compressed data. - if (frame_worker_data->result != 0 || - frame_worker_data->data + frame_worker_data->data_size - 1 > data) { - VPxWorker *const worker = frame_worker_data->pbi->frame_worker_owner; - BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool; - // Signal all the other threads that are waiting for this frame. - vp9_frameworker_lock_stats(worker); - frame_worker_data->frame_context_ready = 1; - lock_buffer_pool(pool); - frame_worker_data->pbi->cur_buf->buf.corrupted = 1; - unlock_buffer_pool(pool); - frame_worker_data->pbi->need_resync = 1; - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); - return 0; - } - } else if (frame_worker_data->result != 0) { - // Check decode result in serial decode. - frame_worker_data->pbi->cur_buf->buf.corrupted = 1; - frame_worker_data->pbi->need_resync = 1; - } - return !frame_worker_data->result; -} - static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { - int i; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - ctx->last_show_frame = -1; - ctx->next_submit_worker_id = 0; - ctx->last_submit_worker_id = 0; - ctx->next_output_worker_id = 0; - ctx->frame_cache_read = 0; - ctx->frame_cache_write = 0; - ctx->num_cache_frames = 0; ctx->need_resync = 1; - ctx->num_frame_workers = - (ctx->frame_parallel_decode == 1) ? 
ctx->cfg.threads : 1; - if (ctx->num_frame_workers > MAX_DECODE_THREADS) - ctx->num_frame_workers = MAX_DECODE_THREADS; - ctx->available_threads = ctx->num_frame_workers; ctx->flushed = 0; ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool)); if (ctx->buffer_pool == NULL) return VPX_CODEC_MEM_ERROR; -#if CONFIG_MULTITHREAD - if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) { - set_error_detail(ctx, "Failed to allocate buffer pool mutex"); + ctx->pbi = vp9_decoder_create(ctx->buffer_pool); + if (ctx->pbi == NULL) { + set_error_detail(ctx, "Failed to allocate decoder"); return VPX_CODEC_MEM_ERROR; } -#endif - - ctx->frame_workers = (VPxWorker *)vpx_malloc(ctx->num_frame_workers * - sizeof(*ctx->frame_workers)); - if (ctx->frame_workers == NULL) { - set_error_detail(ctx, "Failed to allocate frame_workers"); - return VPX_CODEC_MEM_ERROR; - } - - for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - FrameWorkerData *frame_worker_data = NULL; - winterface->init(worker); - worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData)); - if (worker->data1 == NULL) { - set_error_detail(ctx, "Failed to allocate frame_worker_data"); - return VPX_CODEC_MEM_ERROR; - } - frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->pbi = vp9_decoder_create(ctx->buffer_pool); - if (frame_worker_data->pbi == NULL) { - set_error_detail(ctx, "Failed to allocate frame_worker_data"); - return VPX_CODEC_MEM_ERROR; - } - frame_worker_data->pbi->frame_worker_owner = worker; - frame_worker_data->worker_id = i; - frame_worker_data->scratch_buffer = NULL; - frame_worker_data->scratch_buffer_size = 0; - frame_worker_data->frame_context_ready = 0; - frame_worker_data->received_frame = 0; -#if CONFIG_MULTITHREAD - if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) { - set_error_detail(ctx, "Failed to allocate frame_worker_data mutex"); - return VPX_CODEC_MEM_ERROR; - } - - if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) { - set_error_detail(ctx, "Failed to allocate frame_worker_data cond"); - return VPX_CODEC_MEM_ERROR; - } -#endif - // If decoding in serial mode, FrameWorker thread could create tile worker - // thread or loopfilter thread. - frame_worker_data->pbi->max_threads = - (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0; - - frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order; - frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode; - frame_worker_data->pbi->common.frame_parallel_decode = - ctx->frame_parallel_decode; - worker->hook = (VPxWorkerHook)frame_worker_hook; - if (!winterface->reset(worker)) { - set_error_detail(ctx, "Frame Worker thread creation failed"); - return VPX_CODEC_MEM_ERROR; - } - } + ctx->pbi->max_threads = ctx->cfg.threads; + ctx->pbi->inv_tile_order = ctx->invert_tile_order; // If postprocessing was enabled by the application and a // configuration has not been provided, default it. @@ -404,7 +263,7 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, const VP9Decoder *const pbi) { - // Clear resync flag if worker got a key frame or intra only frame. + // Clear resync flag if the decoder got a key frame or intra only frame. 
if (ctx->need_resync == 1 && pbi->need_resync == 0 && (pbi->common.intra_only || pbi->common.frame_type == KEY_FRAME)) ctx->need_resync = 0; @@ -413,7 +272,6 @@ static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, const uint8_t **data, unsigned int data_sz, void *user_priv, int64_t deadline) { - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); (void)deadline; // Determine the stream parameters. Note that we rely on peek_si to @@ -429,103 +287,25 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, if (!ctx->si.is_kf && !is_intra_only) return VPX_CODEC_ERROR; } - if (!ctx->frame_parallel_decode) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->data = *data; - frame_worker_data->data_size = data_sz; - frame_worker_data->user_priv = user_priv; - frame_worker_data->received_frame = 1; + ctx->user_priv = user_priv; - // Set these even if already initialized. The caller may have changed the - // decrypt config between frames. - frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb; - frame_worker_data->pbi->decrypt_state = ctx->decrypt_state; + // Set these even if already initialized. The caller may have changed the + // decrypt config between frames. + ctx->pbi->decrypt_cb = ctx->decrypt_cb; + ctx->pbi->decrypt_state = ctx->decrypt_state; - worker->had_error = 0; - winterface->execute(worker); - - // Update data pointer after decode. - *data = frame_worker_data->data_end; - - if (worker->had_error) - return update_error_state(ctx, &frame_worker_data->pbi->common.error); - - check_resync(ctx, frame_worker_data->pbi); - } else { - VPxWorker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id]; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - // Copy context from last worker thread to next worker thread. - if (ctx->next_submit_worker_id != ctx->last_submit_worker_id) - vp9_frameworker_copy_context( - &ctx->frame_workers[ctx->next_submit_worker_id], - &ctx->frame_workers[ctx->last_submit_worker_id]); - - frame_worker_data->pbi->ready_for_new_data = 0; - // Copy the compressed data into worker's internal buffer. - // TODO(hkuang): Will all the workers allocate the same size - // as the size of the first intra frame be better? This will - // avoid too many deallocate and allocate. 
- if (frame_worker_data->scratch_buffer_size < data_sz) { - frame_worker_data->scratch_buffer = - (uint8_t *)vpx_realloc(frame_worker_data->scratch_buffer, data_sz); - if (frame_worker_data->scratch_buffer == NULL) { - set_error_detail(ctx, "Failed to reallocate scratch buffer"); - return VPX_CODEC_MEM_ERROR; - } - frame_worker_data->scratch_buffer_size = data_sz; - } - frame_worker_data->data_size = data_sz; - memcpy(frame_worker_data->scratch_buffer, *data, data_sz); - - frame_worker_data->frame_decoded = 0; - frame_worker_data->frame_context_ready = 0; - frame_worker_data->received_frame = 1; - frame_worker_data->data = frame_worker_data->scratch_buffer; - frame_worker_data->user_priv = user_priv; - - if (ctx->next_submit_worker_id != ctx->last_submit_worker_id) - ctx->last_submit_worker_id = - (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers; - - ctx->next_submit_worker_id = - (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers; - --ctx->available_threads; - worker->had_error = 0; - winterface->launch(worker); + if (vp9_receive_compressed_data(ctx->pbi, data_sz, data)) { + ctx->pbi->cur_buf->buf.corrupted = 1; + ctx->pbi->need_resync = 1; + ctx->need_resync = 1; + return update_error_state(ctx, &ctx->pbi->common.error); } + check_resync(ctx, ctx->pbi); + return VPX_CODEC_OK; } -static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) { - YV12_BUFFER_CONFIG sd; - vp9_ppflags_t flags = { 0, 0, 0 }; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - ctx->next_output_worker_id = - (ctx->next_output_worker_id + 1) % ctx->num_frame_workers; - // TODO(hkuang): Add worker error handling here. - winterface->sync(worker); - frame_worker_data->received_frame = 0; - ++ctx->available_threads; - - check_resync(ctx, frame_worker_data->pbi); - - if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) { - VP9_COMMON *const cm = &frame_worker_data->pbi->common; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx; - yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd, - frame_worker_data->user_priv); - ctx->frame_cache[ctx->frame_cache_write].img.fb_priv = - frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; - ctx->frame_cache_write = (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE; - ++ctx->num_cache_frames; - } -} - static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, void *user_priv, long deadline) { @@ -543,8 +323,8 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, // Reset flushed when receiving a valid frame. ctx->flushed = 0; - // Initialize the decoder workers on the first frame. - if (ctx->frame_workers == NULL) { + // Initialize the decoder on the first frame. + if (ctx->pbi == NULL) { const vpx_codec_err_t res = init_decoder(ctx); if (res != VPX_CODEC_OK) return res; } @@ -553,91 +333,40 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, ctx->decrypt_cb, ctx->decrypt_state); if (res != VPX_CODEC_OK) return res; - if (ctx->frame_parallel_decode) { - // Decode in frame parallel mode. When decoding in this mode, the frame - // passed to the decoder must be either a normal frame or a superframe with - // superframe index so the decoder could get each frame's start position - // in the superframe. 
- if (frame_count > 0) { - int i; + if (ctx->svc_decoding && ctx->svc_spatial_layer < frame_count - 1) + frame_count = ctx->svc_spatial_layer + 1; - for (i = 0; i < frame_count; ++i) { - const uint8_t *data_start_copy = data_start; - const uint32_t frame_size = frame_sizes[i]; - if (data_start < data || - frame_size > (uint32_t)(data_end - data_start)) { - set_error_detail(ctx, "Invalid frame size in index"); - return VPX_CODEC_CORRUPT_FRAME; - } + // Decode in serial mode. + if (frame_count > 0) { + int i; - if (ctx->available_threads == 0) { - // No more threads for decoding. Wait until the next output worker - // finishes decoding. Then copy the decoded frame into cache. - if (ctx->num_cache_frames < FRAME_CACHE_SIZE) { - wait_worker_and_cache_frame(ctx); - } else { - // TODO(hkuang): Add unit test to test this path. - set_error_detail(ctx, "Frame output cache is full."); - return VPX_CODEC_ERROR; - } - } - - res = - decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); - if (res != VPX_CODEC_OK) return res; - data_start += frame_size; - } - } else { - if (ctx->available_threads == 0) { - // No more threads for decoding. Wait until the next output worker - // finishes decoding. Then copy the decoded frame into cache. - if (ctx->num_cache_frames < FRAME_CACHE_SIZE) { - wait_worker_and_cache_frame(ctx); - } else { - // TODO(hkuang): Add unit test to test this path. - set_error_detail(ctx, "Frame output cache is full."); - return VPX_CODEC_ERROR; - } + for (i = 0; i < frame_count; ++i) { + const uint8_t *data_start_copy = data_start; + const uint32_t frame_size = frame_sizes[i]; + vpx_codec_err_t res; + if (data_start < data || frame_size > (uint32_t)(data_end - data_start)) { + set_error_detail(ctx, "Invalid frame size in index"); + return VPX_CODEC_CORRUPT_FRAME; } - res = decode_one(ctx, &data, data_sz, user_priv, deadline); + res = decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); if (res != VPX_CODEC_OK) return res; + + data_start += frame_size; } } else { - // Decode in serial mode. - if (frame_count > 0) { - int i; + while (data_start < data_end) { + const uint32_t frame_size = (uint32_t)(data_end - data_start); + const vpx_codec_err_t res = + decode_one(ctx, &data_start, frame_size, user_priv, deadline); + if (res != VPX_CODEC_OK) return res; - for (i = 0; i < frame_count; ++i) { - const uint8_t *data_start_copy = data_start; - const uint32_t frame_size = frame_sizes[i]; - vpx_codec_err_t res; - if (data_start < data || - frame_size > (uint32_t)(data_end - data_start)) { - set_error_detail(ctx, "Invalid frame size in index"); - return VPX_CODEC_CORRUPT_FRAME; - } - - res = - decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); - if (res != VPX_CODEC_OK) return res; - - data_start += frame_size; - } - } else { + // Account for suboptimal termination by the encoder. while (data_start < data_end) { - const uint32_t frame_size = (uint32_t)(data_end - data_start); - const vpx_codec_err_t res = - decode_one(ctx, &data_start, frame_size, user_priv, deadline); - if (res != VPX_CODEC_OK) return res; - - // Account for suboptimal termination by the encoder. 
- while (data_start < data_end) { - const uint8_t marker = - read_marker(ctx->decrypt_cb, ctx->decrypt_state, data_start); - if (marker) break; - ++data_start; - } + const uint8_t marker = + read_marker(ctx->decrypt_cb, ctx->decrypt_state, data_start); + if (marker) break; + ++data_start; } } } @@ -645,80 +374,28 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, return res; } -static void release_last_output_frame(vpx_codec_alg_priv_t *ctx) { - RefCntBuffer *const frame_bufs = ctx->buffer_pool->frame_bufs; - // Decrease reference count of last output frame in frame parallel mode. - if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) { - BufferPool *const pool = ctx->buffer_pool; - lock_buffer_pool(pool); - decrease_ref_count(ctx->last_show_frame, frame_bufs, pool); - unlock_buffer_pool(pool); - } -} - static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter) { vpx_image_t *img = NULL; - // Only return frame when all the cpu are busy or - // application fluhsed the decoder in frame parallel decode. - if (ctx->frame_parallel_decode && ctx->available_threads > 0 && - !ctx->flushed) { - return NULL; - } + // Legacy parameter carried over from VP8. Has no effect for VP9 since we + // always return only 1 frame per decode call. + (void)iter; - // Output the frames in the cache first. - if (ctx->num_cache_frames > 0) { - release_last_output_frame(ctx); - ctx->last_show_frame = ctx->frame_cache[ctx->frame_cache_read].fb_idx; - if (ctx->need_resync) return NULL; - img = &ctx->frame_cache[ctx->frame_cache_read].img; - ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE; - --ctx->num_cache_frames; - return img; - } - - // iter acts as a flip flop, so an image is only returned on the first - // call to get_frame. - if (*iter == NULL && ctx->frame_workers != NULL) { - do { - YV12_BUFFER_CONFIG sd; - vp9_ppflags_t flags = { 0, 0, 0 }; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - ctx->next_output_worker_id = - (ctx->next_output_worker_id + 1) % ctx->num_frame_workers; - if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) - set_ppflags(ctx, &flags); - // Wait for the frame from worker thread. - if (winterface->sync(worker)) { - // Check if worker has received any frames. - if (frame_worker_data->received_frame == 1) { - ++ctx->available_threads; - frame_worker_data->received_frame = 0; - check_resync(ctx, frame_worker_data->pbi); - } - if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) { - VP9_COMMON *const cm = &frame_worker_data->pbi->common; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - release_last_output_frame(ctx); - ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx; - if (ctx->need_resync) return NULL; - yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv); - ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; - img = &ctx->img; - return img; - } - } else { - // Decoding failed. Release the worker thread. 
- frame_worker_data->received_frame = 0; - ++ctx->available_threads; - ctx->need_resync = 1; - if (ctx->flushed != 1) return NULL; - } - } while (ctx->next_output_worker_id != ctx->next_submit_worker_id); + if (ctx->pbi != NULL) { + YV12_BUFFER_CONFIG sd; + vp9_ppflags_t flags = { 0, 0, 0 }; + if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) set_ppflags(ctx, &flags); + if (vp9_get_raw_frame(ctx->pbi, &sd, &flags) == 0) { + VP9_COMMON *const cm = &ctx->pbi->common; + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + ctx->last_show_frame = ctx->pbi->common.new_fb_idx; + if (ctx->need_resync) return NULL; + yuvconfig2image(&ctx->img, &sd, ctx->user_priv); + ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; + img = &ctx->img; + return img; + } } return NULL; } @@ -728,7 +405,7 @@ static vpx_codec_err_t decoder_set_fb_fn( vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { if (cb_get == NULL || cb_release == NULL) { return VPX_CODEC_INVALID_PARAM; - } else if (ctx->frame_workers == NULL) { + } else if (ctx->pbi == NULL) { // If the decoder has already been initialized, do not accept changes to // the frame buffer functions. ctx->get_ext_fb_cb = cb_get; @@ -744,21 +421,12 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (data) { vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data; YV12_BUFFER_CONFIG sd; - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; image2yuvconfig(&frame->img, &sd); - return vp9_set_reference_dec(&frame_worker_data->pbi->common, - ref_frame_to_vp9_reframe(frame->frame_type), - &sd); + return vp9_set_reference_dec( + &ctx->pbi->common, ref_frame_to_vp9_reframe(frame->frame_type), &sd); } else { return VPX_CODEC_INVALID_PARAM; } @@ -768,20 +436,12 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (data) { vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; YV12_BUFFER_CONFIG sd; - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; image2yuvconfig(&frame->img, &sd); - return vp9_copy_reference_dec(frame_worker_data->pbi, - (VP9_REFFRAME)frame->frame_type, &sd); + return vp9_copy_reference_dec(ctx->pbi, (VP9_REFFRAME)frame->frame_type, + &sd); } else { return VPX_CODEC_INVALID_PARAM; } @@ -791,17 +451,9 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *); - // Only support this function in serial decode. 
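With the frame-parallel worker pool removed, decoder_get_frame() above hands back at most one image per successful decode call, so the usual caller-side pull loop still applies. A minimal sketch of that contract (editor's illustration, not taken from this patch; assumes an initialized VP9 decoder context named decoder and one compressed frame in buf of length len):

  vpx_codec_iter_t iter = NULL;
  vpx_image_t *img;
  if (vpx_codec_decode(&decoder, buf, len, NULL, 0) != VPX_CODEC_OK) {
    /* handle the decode error */
  }
  while ((img = vpx_codec_get_frame(&decoder, &iter)) != NULL) {
    /* consume img; for VP9 this loop yields at most one image per decode call */
  }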
- if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (data) { YV12_BUFFER_CONFIG *fb; - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx); + fb = get_ref_frame(&ctx->pbi->common, data->idx); if (fb == NULL) return VPX_CODEC_ERROR; yuvconfig2image(&data->img, fb, NULL); return VPX_CODEC_OK; @@ -829,22 +481,21 @@ static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx, #endif } +static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL || ctx->pbi == NULL) return VPX_CODEC_INVALID_PARAM; + *arg = ctx->pbi->common.base_qindex; + return VPX_CODEC_OK; +} + static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, va_list args) { int *const update_info = va_arg(args, int *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (update_info) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - *update_info = frame_worker_data->pbi->refresh_frame_flags; + if (ctx->pbi != NULL) { + *update_info = ctx->pbi->refresh_frame_flags; return VPX_CODEC_OK; } else { return VPX_CODEC_ERROR; @@ -859,14 +510,9 @@ static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, int *corrupted = va_arg(args, int *); if (corrupted) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - RefCntBuffer *const frame_bufs = - frame_worker_data->pbi->common.buffer_pool->frame_bufs; - if (frame_worker_data->pbi->common.frame_to_show == NULL) - return VPX_CODEC_ERROR; + if (ctx->pbi != NULL) { + RefCntBuffer *const frame_bufs = ctx->pbi->common.buffer_pool->frame_bufs; + if (ctx->pbi->common.frame_to_show == NULL) return VPX_CODEC_ERROR; if (ctx->last_show_frame >= 0) *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted; return VPX_CODEC_OK; @@ -882,18 +528,9 @@ static vpx_codec_err_t ctrl_get_frame_size(vpx_codec_alg_priv_t *ctx, va_list args) { int *const frame_size = va_arg(args, int *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (frame_size) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - const VP9_COMMON *const cm = &frame_worker_data->pbi->common; + if (ctx->pbi != NULL) { + const VP9_COMMON *const cm = &ctx->pbi->common; frame_size[0] = cm->width; frame_size[1] = cm->height; return VPX_CODEC_OK; @@ -909,18 +546,9 @@ static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx, va_list args) { int *const render_size = va_arg(args, int *); - // Only support this function in serial decode. 
- if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (render_size) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - const VP9_COMMON *const cm = &frame_worker_data->pbi->common; + if (ctx->pbi != NULL) { + const VP9_COMMON *const cm = &ctx->pbi->common; render_size[0] = cm->render_width; render_size[1] = cm->render_height; return VPX_CODEC_OK; @@ -935,13 +563,10 @@ static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx, va_list args) { unsigned int *const bit_depth = va_arg(args, unsigned int *); - VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; if (bit_depth) { - if (worker) { - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - const VP9_COMMON *const cm = &frame_worker_data->pbi->common; + if (ctx->pbi != NULL) { + const VP9_COMMON *const cm = &ctx->pbi->common; *bit_depth = cm->bit_depth; return VPX_CODEC_OK; } else { @@ -980,10 +605,8 @@ static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_INVALID_PARAM; ctx->byte_alignment = byte_alignment; - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->pbi->common.byte_alignment = byte_alignment; + if (ctx->pbi != NULL) { + ctx->pbi->common.byte_alignment = byte_alignment; } return VPX_CODEC_OK; } @@ -992,15 +615,23 @@ static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx, va_list args) { ctx->skip_loop_filter = va_arg(args, int); - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->pbi->common.skip_loop_filter = ctx->skip_loop_filter; + if (ctx->pbi != NULL) { + ctx->pbi->common.skip_loop_filter = ctx->skip_loop_filter; } return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_spatial_layer_svc(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->svc_decoding = 1; + ctx->svc_spatial_layer = va_arg(args, int); + if (ctx->svc_spatial_layer < 0) + return VPX_CODEC_INVALID_PARAM; + else + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, @@ -1011,8 +642,10 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { VPXD_SET_DECRYPTOR, ctrl_set_decryptor }, { VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment }, { VP9_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter }, + { VP9_DECODE_SVC_SPATIAL_LAYER, ctrl_set_spatial_layer_svc }, // Getters + { VPXD_GET_LAST_QUANTIZER, ctrl_get_quantizer }, { VP8D_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates }, { VP8D_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted }, { VP9_GET_REFERENCE, ctrl_get_reference }, @@ -1029,7 +662,10 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { CODEC_INTERFACE(vpx_codec_vp9_dx) = { "WebM Project VP9 Decoder" VERSION_STRING, VPX_CODEC_INTERNAL_ABI_VERSION, - VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC | +#if CONFIG_VP9_HIGHBITDEPTH + VPX_CODEC_CAP_HIGHBITDEPTH | +#endif + VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC | VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER, // vpx_codec_caps_t decoder_init, // vpx_codec_init_fn_t decoder_destroy, // vpx_codec_destroy_fn_t diff --git 
a/libs/libvpx/vp9/vp9_dx_iface.h b/libs/libvpx/vp9/vp9_dx_iface.h index cc3d51842a..18bc7ab0d6 100644 --- a/libs/libvpx/vp9/vp9_dx_iface.h +++ b/libs/libvpx/vp9/vp9_dx_iface.h @@ -15,19 +15,12 @@ typedef vpx_codec_stream_info_t vp9_stream_info_t; -// This limit is due to framebuffer numbers. -// TODO(hkuang): Remove this limit after implementing ondemand framebuffers. -#define FRAME_CACHE_SIZE 6 // Cache maximum 6 decoded frames. - -typedef struct cache_frame { - int fb_idx; - vpx_image_t img; -} cache_frame; - struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_dec_cfg_t cfg; vp9_stream_info_t si; + VP9Decoder *pbi; + void *user_priv; int postproc_cfg_set; vp8_postproc_cfg_t postproc_cfg; vpx_decrypt_cb decrypt_cb; @@ -40,26 +33,18 @@ struct vpx_codec_alg_priv { int byte_alignment; int skip_loop_filter; - // Frame parallel related. - int frame_parallel_decode; // frame-based threading. - VPxWorker *frame_workers; - int num_frame_workers; - int next_submit_worker_id; - int last_submit_worker_id; - int next_output_worker_id; - int available_threads; - cache_frame frame_cache[FRAME_CACHE_SIZE]; - int frame_cache_write; - int frame_cache_read; - int num_cache_frames; int need_resync; // wait for key/intra-only frame - // BufferPool that holds all reference frames. Shared by all the FrameWorkers. + // BufferPool that holds all reference frames. BufferPool *buffer_pool; // External frame buffer info to save for VP9 common. void *ext_priv; // Private data associated with the external frame buffers. vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb; vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb; + + // Allow for decoding up to a given spatial layer for SVC stream. + int svc_decoding; + int svc_spatial_layer; }; #endif // VP9_VP9_DX_IFACE_H_ diff --git a/libs/libvpx/vp9/vp9cx.mk b/libs/libvpx/vp9/vp9cx.mk index a8ca0d5935..d633ed1429 100644 --- a/libs/libvpx/vp9/vp9cx.mk +++ b/libs/libvpx/vp9/vp9cx.mk @@ -39,9 +39,13 @@ VP9_CX_SRCS-yes += encoder/vp9_encodemb.h VP9_CX_SRCS-yes += encoder/vp9_encodemv.h VP9_CX_SRCS-yes += encoder/vp9_extend.h VP9_CX_SRCS-yes += encoder/vp9_firstpass.h +VP9_CX_SRCS-yes += encoder/vp9_frame_scale.c +VP9_CX_SRCS-yes += encoder/vp9_job_queue.h VP9_CX_SRCS-yes += encoder/vp9_lookahead.c VP9_CX_SRCS-yes += encoder/vp9_lookahead.h VP9_CX_SRCS-yes += encoder/vp9_mcomp.h +VP9_CX_SRCS-yes += encoder/vp9_multi_thread.c +VP9_CX_SRCS-yes += encoder/vp9_multi_thread.h VP9_CX_SRCS-yes += encoder/vp9_encoder.h VP9_CX_SRCS-yes += encoder/vp9_quantize.h VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h @@ -96,7 +100,8 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c + VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) @@ -104,12 +109,7 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c endif VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm -ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm -VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm -else VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm -endif ifeq ($(ARCH_X86_64),yes) VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -117,20 +117,20 @@ endif 
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c -ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c -endif ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_denoiser_neon.c endif -VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c endif +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c @@ -138,6 +138,11 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h -VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c + +# Strip unnecessary files with CONFIG_REALTIME_ONLY +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_sse4.c VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/libs/libvpx/vp9/vp9dx.mk b/libs/libvpx/vp9/vp9dx.mk index 4c6fd00715..59f612b94c 100644 --- a/libs/libvpx/vp9/vp9dx.mk +++ b/libs/libvpx/vp9/vp9dx.mk @@ -24,8 +24,6 @@ VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h VP9_DX_SRCS-yes += decoder/vp9_detokenize.c VP9_DX_SRCS-yes += decoder/vp9_decodemv.h VP9_DX_SRCS-yes += decoder/vp9_detokenize.h -VP9_DX_SRCS-yes += decoder/vp9_dthread.c -VP9_DX_SRCS-yes += decoder/vp9_dthread.h VP9_DX_SRCS-yes += decoder/vp9_decoder.c VP9_DX_SRCS-yes += decoder/vp9_decoder.h VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c diff --git a/libs/libvpx/vpx/src/svc_encodeframe.c b/libs/libvpx/vpx/src/svc_encodeframe.c index 5aa0b8ddb8..f633600c79 100644 --- a/libs/libvpx/vpx/src/svc_encodeframe.c +++ b/libs/libvpx/vpx/src/svc_encodeframe.c @@ -53,6 +53,10 @@ static const int DEFAULT_SCALE_FACTORS_NUM[VPX_SS_MAX_LAYERS] = { 4, 5, 7, 11, static const int DEFAULT_SCALE_FACTORS_DEN[VPX_SS_MAX_LAYERS] = { 16, 16, 16, 16, 16 }; +static const int DEFAULT_SCALE_FACTORS_NUM_2x[VPX_SS_MAX_LAYERS] = { 1, 2, 4 }; + +static const int DEFAULT_SCALE_FACTORS_DEN_2x[VPX_SS_MAX_LAYERS] = { 4, 4, 4 }; + typedef enum { QUANTIZER = 0, BITRATE, @@ -127,9 +131,9 @@ static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level, const char *fmt, static vpx_codec_err_t extract_option(LAYER_OPTION_TYPE type, char *input, int *value0, int *value1) { if (type == SCALE_FACTOR) { - *value0 = strtol(input, &input, 10); + *value0 = (int)strtol(input, &input, 10); if (*input++ != '/') return VPX_CODEC_INVALID_PARAM; - *value1 = strtol(input, &input, 10); + *value1 = (int)strtol(input, &input, 10); if (*value0 < option_min_values[SCALE_FACTOR] || *value1 < option_min_values[SCALE_FACTOR] || @@ -156,6 +160,9 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, char *token; const char 
*delim = ","; char *save_ptr; + int num_layers = svc_ctx->spatial_layers; + if (type == BITRATE) + num_layers = svc_ctx->spatial_layers * svc_ctx->temporal_layers; if (input == NULL || option0 == NULL || (option1 == NULL && type == SCALE_FACTOR)) @@ -163,7 +170,7 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, input_string = strdup(input); token = strtok_r(input_string, delim, &save_ptr); - for (i = 0; i < svc_ctx->spatial_layers; ++i) { + for (i = 0; i < num_layers; ++i) { if (token != NULL) { res = extract_option(type, token, option0 + i, option1 + i); if (res != VPX_CODEC_OK) break; @@ -172,11 +179,11 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, break; } } - if (res == VPX_CODEC_OK && i != svc_ctx->spatial_layers) { + if (res == VPX_CODEC_OK && i != num_layers) { svc_log(svc_ctx, SVC_LOG_ERROR, "svc: layer params type: %d %d values required, " "but only %d specified\n", - type, svc_ctx->spatial_layers, i); + type, num_layers, i); res = VPX_CODEC_INVALID_PARAM; } free(input_string); @@ -194,7 +201,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) { char *input_string; char *option_name; char *option_value; - char *input_ptr; + char *input_ptr = NULL; SvcInternal_t *const si = get_svc_internal(svc_ctx); vpx_codec_err_t res = VPX_CODEC_OK; int i, alt_ref_enabled = 0; @@ -287,24 +294,30 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) { return VPX_CODEC_OK; } -void assign_layer_bitrates(const SvcContext *svc_ctx, - vpx_codec_enc_cfg_t *const enc_cfg) { +vpx_codec_err_t assign_layer_bitrates(const SvcContext *svc_ctx, + vpx_codec_enc_cfg_t *const enc_cfg) { int i; const SvcInternal_t *const si = get_const_svc_internal(svc_ctx); int sl, tl, spatial_layer_target; if (svc_ctx->temporal_layering_mode != 0) { if (si->bitrates[0] != 0) { - enc_cfg->rc_target_bitrate = 0; + unsigned int total_bitrate = 0; for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { - enc_cfg->ss_target_bitrate[sl * svc_ctx->temporal_layers] = 0; + total_bitrate += si->bitrates[sl * svc_ctx->temporal_layers + + svc_ctx->temporal_layers - 1]; for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { enc_cfg->ss_target_bitrate[sl * svc_ctx->temporal_layers] += (unsigned int)si->bitrates[sl * svc_ctx->temporal_layers + tl]; enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + tl] = si->bitrates[sl * svc_ctx->temporal_layers + tl]; + if (tl > 0 && (si->bitrates[sl * svc_ctx->temporal_layers + tl] <= + si->bitrates[sl * svc_ctx->temporal_layers + tl - 1])) + return VPX_CODEC_INVALID_PARAM; } } + if (total_bitrate != enc_cfg->rc_target_bitrate) + return VPX_CODEC_INVALID_PARAM; } else { float total = 0; float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; @@ -341,11 +354,14 @@ void assign_layer_bitrates(const SvcContext *svc_ctx, } } else { if (si->bitrates[0] != 0) { - enc_cfg->rc_target_bitrate = 0; + unsigned int total_bitrate = 0; for (i = 0; i < svc_ctx->spatial_layers; ++i) { enc_cfg->ss_target_bitrate[i] = (unsigned int)si->bitrates[i]; - enc_cfg->rc_target_bitrate += si->bitrates[i]; + enc_cfg->layer_target_bitrate[i] = (unsigned int)si->bitrates[i]; + total_bitrate += si->bitrates[i]; } + if (total_bitrate != enc_cfg->rc_target_bitrate) + return VPX_CODEC_INVALID_PARAM; } else { float total = 0; float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; @@ -368,6 +384,7 @@ void assign_layer_bitrates(const SvcContext *svc_ctx, } } } + return VPX_CODEC_OK; } vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, 
vpx_codec_ctx_t *codec_ctx, @@ -412,12 +429,28 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN[sl]; si->svc_params.speed_per_layer[sl] = svc_ctx->speed; } - + if (enc_cfg->rc_end_usage == VPX_CBR && enc_cfg->g_pass == VPX_RC_ONE_PASS && + svc_ctx->spatial_layers <= 3) { + for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { + int sl2 = (svc_ctx->spatial_layers == 2) ? sl + 1 : sl; + si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM_2x[sl2]; + si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN_2x[sl2]; + } + if (svc_ctx->spatial_layers == 1) { + si->svc_params.scaling_factor_num[0] = 1; + si->svc_params.scaling_factor_den[0] = 1; + } + } for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { i = sl * svc_ctx->temporal_layers + tl; si->svc_params.max_quantizers[i] = MAX_QUANTIZER; si->svc_params.min_quantizers[i] = 0; + if (enc_cfg->rc_end_usage == VPX_CBR && + enc_cfg->g_pass == VPX_RC_ONE_PASS) { + si->svc_params.max_quantizers[i] = 56; + si->svc_params.min_quantizers[i] = 2; + } } } @@ -442,7 +475,15 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, (int)VPX_MAX_LAYERS); return VPX_CODEC_INVALID_PARAM; } - assign_layer_bitrates(svc_ctx, enc_cfg); + res = assign_layer_bitrates(svc_ctx, enc_cfg); + if (res != VPX_CODEC_OK) { + svc_log(svc_ctx, SVC_LOG_ERROR, + "layer bitrates incorrect: \n" + "1) spatial layer bitrates should sum up to target \n" + "2) temporal layer bitrates should be increasing within \n" + "a spatial layer \n"); + return VPX_CODEC_INVALID_PARAM; + } #if CONFIG_SPATIAL_SVC for (i = 0; i < svc_ctx->spatial_layers; ++i) @@ -518,8 +559,7 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, iter = NULL; while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) { switch (cx_pkt->kind) { -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) -#if CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) case VPX_CODEC_SPATIAL_SVC_LAYER_PSNR: { int i; for (i = 0; i < svc_ctx->spatial_layers; ++i) { @@ -553,10 +593,9 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, si->bytes_sum[i] += cx_pkt->data.layer_sizes[i]; break; } -#endif #endif case VPX_CODEC_PSNR_PKT: { -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) +#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) int j; svc_log(svc_ctx, SVC_LOG_DEBUG, "frame: %d, layer: %d, PSNR(Total/Y/U/V): " diff --git a/libs/libvpx/vpx/src/vpx_encoder.c b/libs/libvpx/vpx/src/vpx_encoder.c index 4390cf7c8f..1cf2dca695 100644 --- a/libs/libvpx/vpx/src/vpx_encoder.c +++ b/libs/libvpx/vpx/src/vpx_encoder.c @@ -12,8 +12,11 @@ * \brief Provides the high level interface to wrap encoder algorithms. 
* */ +#include #include +#include #include +#include "vp8/common/blockd.h" #include "vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" @@ -81,6 +84,8 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( int i; void *mem_loc = NULL; + if (iface->enc.mr_get_mem_loc == NULL) return VPX_CODEC_INCAPABLE; + if (!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc))) { for (i = 0; i < num_enc; i++) { vpx_codec_priv_enc_mr_cfg_t mr_cfg; @@ -89,28 +94,27 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( if (dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 || dsf->den > dsf->num) { res = VPX_CODEC_INVALID_PARAM; - break; + } else { + mr_cfg.mr_low_res_mode_info = mem_loc; + mr_cfg.mr_total_resolutions = num_enc; + mr_cfg.mr_encoder_id = num_enc - 1 - i; + mr_cfg.mr_down_sampling_factor.num = dsf->num; + mr_cfg.mr_down_sampling_factor.den = dsf->den; + + /* Force Key-frame synchronization. Namely, encoder at higher + * resolution always use the same frame_type chosen by the + * lowest-resolution encoder. + */ + if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED; + + ctx->iface = iface; + ctx->name = iface->name; + ctx->priv = NULL; + ctx->init_flags = flags; + ctx->config.enc = cfg; + res = ctx->iface->init(ctx, &mr_cfg); } - mr_cfg.mr_low_res_mode_info = mem_loc; - mr_cfg.mr_total_resolutions = num_enc; - mr_cfg.mr_encoder_id = num_enc - 1 - i; - mr_cfg.mr_down_sampling_factor.num = dsf->num; - mr_cfg.mr_down_sampling_factor.den = dsf->den; - - /* Force Key-frame synchronization. Namely, encoder at higher - * resolution always use the same frame_type chosen by the - * lowest-resolution encoder. - */ - if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED; - - ctx->iface = iface; - ctx->name = iface->name; - ctx->priv = NULL; - ctx->init_flags = flags; - ctx->config.enc = cfg; - res = ctx->iface->init(ctx, &mr_cfg); - if (res) { const char *error_detail = ctx->priv ? ctx->priv->err_detail : NULL; /* Destroy current ctx */ @@ -124,10 +128,14 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( vpx_codec_destroy(ctx); i--; } +#if CONFIG_MULTI_RES_ENCODING + assert(mem_loc); + free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info); + free(mem_loc); +#endif + return SAVE_STATUS(ctx, res); } - if (res) break; - ctx++; cfg++; dsf++; diff --git a/libs/libvpx/vpx/svc_context.h b/libs/libvpx/vpx/svc_context.h index c8bde5832a..462785075c 100644 --- a/libs/libvpx/vpx/svc_context.h +++ b/libs/libvpx/vpx/svc_context.h @@ -54,7 +54,7 @@ typedef struct SvcInternal { // values extracted from option, quantizers vpx_svc_extra_cfg_t svc_params; int enable_auto_alt_ref[VPX_SS_MAX_LAYERS]; - int bitrates[VPX_SS_MAX_LAYERS]; + int bitrates[VPX_MAX_LAYERS]; // accumulated statistics double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS]; // total/Y/U/V diff --git a/libs/libvpx/vpx/vp8cx.h b/libs/libvpx/vpx/vp8cx.h index 8fa25e8bc0..c21b8b60db 100644 --- a/libs/libvpx/vpx/vp8cx.h +++ b/libs/libvpx/vpx/vp8cx.h @@ -333,11 +333,12 @@ enum vp8e_enc_control_id { * 2 = 4 tile columns * ..... * n = 2**n tile columns - * The requested tile columns will be capped by encoder based on image size - * limitation (The minimum width of a tile column is 256 pixel, the maximum - * is 4096). + * The requested tile columns will be capped by the encoder based on image + * size limitations (The minimum width of a tile column is 256 pixels, the + * maximum is 4096). * - * By default, the value is 0, i.e. one single column tile for entire image. + * By default, the value is 6, i.e., the maximum number of tiles supported by + * the resolution. 
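 * (Editor's illustration, not part of this change: the control takes the log2
 * of the requested column count, so with an initialized encoder context named
 * codec,
 *   vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, 2);
 * requests 2**2 = 4 tile columns, subject to the cap described above.)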
* * Supported in codecs: VP9 */ @@ -368,10 +369,10 @@ enum vp8e_enc_control_id { * VP9 has a bitstream feature to reduce decoding dependency between frames * by turning off backward update of probability context used in encoding * and decoding. This allows staged parallel processing of more than one - * video frames in the decoder. This control function provides a mean to + * video frame in the decoder. This control function provides a means to * turn this feature on or off for bitstreams produced by encoder. * - * By default, this feature is off. + * By default, this feature is on. * * Supported in codecs: VP9 */ @@ -407,7 +408,7 @@ enum vp8e_enc_control_id { /*!\brief Codec control function to set noise sensitivity. * - * 0: off, 1: On(YOnly) + * 0: off, 1: On(YOnly), 2: For SVC only, on top two spatial layers(YOnly) * * Supported in codecs: VP9 */ @@ -443,6 +444,7 @@ enum vp8e_enc_control_id { * \note Valid parameter range: * VP9E_CONTENT_DEFAULT = Regular video content (Default) * VP9E_CONTENT_SCREEN = Screen capture content + * VP9E_CONTENT_FILM = Film content: improves grain retention * * Supported in codecs: VP9 */ @@ -547,6 +549,14 @@ enum vp8e_enc_control_id { */ VP9E_SET_TARGET_LEVEL, + /*!\brief Codec control function to set row level multi-threading. + * + * 0 : off, 1 : on + * + * Supported in codecs: VP9 + */ + VP9E_SET_ROW_MT, + /*!\brief Codec control function to get bitstream level. * * Supported in codecs: VP9 @@ -561,7 +571,31 @@ enum vp8e_enc_control_id { * * Supported in codecs: VP9 */ - VP9E_SET_ALT_REF_AQ + VP9E_SET_ALT_REF_AQ, + + /*!\brief Boost percentage for Golden Frame in CBR mode. + * + * This value controls the amount of boost given to Golden Frame in + * CBR mode. It is expressed as a percentage of the average + * per-frame bitrate, with the special (and default) value 0 meaning + * the feature is off, i.e., no golden frame boost in CBR mode and + * average bitrate target is used. + * + * For example, to allow 100% more bits, i.e, 2X, in a golden frame + * than average frame, set this to 100. + * + * Supported in codecs: VP8 + */ + VP8E_SET_GF_CBR_BOOST_PCT, + + /*!\brief Codec control function to enable the extreme motion vector unit test + * in VP9. Please note that this is only used in motion vector unit test. + * + * 0 : off, 1 : MAX_EXTREME_MV, 2 : MIN_EXTREME_MV + * + * Supported in codecs: VP9 + */ + VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, }; /*!\brief vpx 1-D scaling mode @@ -663,6 +697,7 @@ typedef enum { typedef enum { VP9E_CONTENT_DEFAULT, VP9E_CONTENT_SCREEN, + VP9E_CONTENT_FILM, VP9E_CONTENT_INVALID } vp9e_tune_content; @@ -769,6 +804,9 @@ VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int) VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTER_BITRATE_PCT, unsigned int) #define VPX_CTRL_VP8E_SET_MAX_INTER_BITRATE_PCT +VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int) +#define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT + VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int) #define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE @@ -820,9 +858,15 @@ VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *) VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int) #define VPX_CTRL_VP9E_SET_TARGET_LEVEL +VPX_CTRL_USE_TYPE(VP9E_SET_ROW_MT, unsigned int) +#define VPX_CTRL_VP9E_SET_ROW_MT + VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *) #define VPX_CTRL_VP9E_GET_LEVEL +VPX_CTRL_USE_TYPE(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int) +#define VPX_CTRL_VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST + /*!\endcond */ /*! 
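 * Example use of the new encoder controls above (an editor's sketch, assuming
 * an already-initialized VP9 encoder context named codec):
 *   vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1);
 *   vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_FILM);
 * ROW_MT enables row-based multi-threading within a tile, and CONTENT_FILM is
 * the new tuning that favours grain retention; the VP8-only
 * VP8E_SET_GF_CBR_BOOST_PCT control is applied the same way to a VP8 context.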
@} - end defgroup vp8_encoder */ #ifdef __cplusplus diff --git a/libs/libvpx/vpx/vp8dx.h b/libs/libvpx/vpx/vp8dx.h index 88204acd37..398c670220 100644 --- a/libs/libvpx/vpx/vp8dx.h +++ b/libs/libvpx/vpx/vp8dx.h @@ -111,6 +111,19 @@ enum vp8_dec_control_id { */ VP9_SET_SKIP_LOOP_FILTER, + /** control function to decode SVC stream up to the x spatial layers, + * where x is passed in through the control, and is 0 for base layer. + */ + VP9_DECODE_SVC_SPATIAL_LAYER, + + /*!\brief Codec control function to get last decoded frame quantizer. + * + * Return value uses internal quantizer scale defined by the codec. + * + * Supported in codecs: VP8, VP9 + */ + VPXD_GET_LAST_QUANTIZER, + VP8_DECODER_CTRL_ID_MAX }; @@ -150,6 +163,8 @@ VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED, int *) #define VPX_CTRL_VP8D_GET_FRAME_CORRUPTED VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *) #define VPX_CTRL_VP8D_GET_LAST_REF_USED +VPX_CTRL_USE_TYPE(VPXD_GET_LAST_QUANTIZER, int *) +#define VPX_CTRL_VPXD_GET_LAST_QUANTIZER VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *) #define VPX_CTRL_VPXD_SET_DECRYPTOR VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *) @@ -162,6 +177,10 @@ VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *) #define VPX_CTRL_VP9D_GET_FRAME_SIZE VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER +#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER +VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int) +#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER +VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int) /*!\endcond */ /*! @} - end defgroup vp8_decoder */ diff --git a/libs/libvpx/vpx/vpx_codec.h b/libs/libvpx/vpx/vpx_codec.h index fe75d23872..ad05f4c74e 100644 --- a/libs/libvpx/vpx/vpx_codec.h +++ b/libs/libvpx/vpx/vpx_codec.h @@ -42,38 +42,39 @@ extern "C" { #endif -#include "./vpx_integer.h" #include "./vpx_image.h" +#include "./vpx_integer.h" /*!\brief Decorator indicating a function is deprecated */ -#ifndef DEPRECATED +#ifndef VPX_DEPRECATED #if defined(__GNUC__) && __GNUC__ -#define DEPRECATED __attribute__((deprecated)) +#define VPX_DEPRECATED __attribute__((deprecated)) #elif defined(_MSC_VER) -#define DEPRECATED +#define VPX_DEPRECATED #else -#define DEPRECATED +#define VPX_DEPRECATED #endif -#endif /* DEPRECATED */ +#endif /* VPX_DEPRECATED */ -#ifndef DECLSPEC_DEPRECATED +#ifndef VPX_DECLSPEC_DEPRECATED #if defined(__GNUC__) && __GNUC__ -#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */ +#define VPX_DECLSPEC_DEPRECATED /**< \copydoc #VPX_DEPRECATED */ #elif defined(_MSC_VER) -/*!\brief \copydoc #DEPRECATED */ -#define DECLSPEC_DEPRECATED __declspec(deprecated) +/*!\brief \copydoc #VPX_DEPRECATED */ +#define VPX_DECLSPEC_DEPRECATED __declspec(deprecated) #else -#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */ +#define VPX_DECLSPEC_DEPRECATED /**< \copydoc #VPX_DEPRECATED */ #endif -#endif /* DECLSPEC_DEPRECATED */ +#endif /* VPX_DECLSPEC_DEPRECATED */ /*!\brief Decorator indicating a function is potentially unused */ -#ifdef UNUSED -#elif defined(__GNUC__) || defined(__clang__) -#define UNUSED __attribute__((unused)) +#ifndef VPX_UNUSED +#if defined(__GNUC__) || defined(__clang__) +#define VPX_UNUSED __attribute__((unused)) #else -#define UNUSED +#define VPX_UNUSED #endif +#endif /* VPX_UNUSED */ /*!\brief Current ABI version number * @@ -83,7 +84,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_CODEC_ABI_VERSION (3 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/ 
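The decoder-side controls introduced in the vp8dx.h hunk above are invoked like any other codec control. A hedged sketch (editor's illustration; assumes an initialized VP9 decoder context named decoder, and the getter is only meaningful after at least one frame has been decoded):

  int last_qindex = 0;
  /* Decode only the base layer plus one enhancement layer of an SVC stream. */
  vpx_codec_control(&decoder, VP9_DECODE_SVC_SPATIAL_LAYER, 1);
  /* Internal quantizer index of the most recently decoded frame. */
  vpx_codec_control(&decoder, VPXD_GET_LAST_QUANTIZER, &last_qindex);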
+#define VPX_CODEC_ABI_VERSION (4 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/ /*!\brief Algorithm return codes */ typedef enum { @@ -152,6 +153,10 @@ typedef long vpx_codec_caps_t; #define VPX_CODEC_CAP_DECODER 0x1 /**< Is a decoder */ #define VPX_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */ +/*! Can support images at greater than 8 bitdepth. + */ +#define VPX_CODEC_CAP_HIGHBITDEPTH 0x4 + /*! \brief Initialization-time Feature Enabling * * Certain codec features must be known at initialization time, to allow for @@ -409,7 +414,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); */ #define VPX_CTRL_USE_TYPE(id, typ) \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int, typ) \ - UNUSED; \ + VPX_UNUSED; \ \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx, \ int ctrl_id, typ data) { \ @@ -426,13 +431,13 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); * It defines a static function with the correctly typed arguments as a * wrapper to the type-unsafe internal function. */ -#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ - DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ - vpx_codec_ctx_t *, int, typ) DEPRECATED UNUSED; \ - \ - DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ - vpx_codec_ctx_t *ctx, int ctrl_id, typ data) { \ - return vpx_codec_control_(ctx, ctrl_id, data); \ +#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ + VPX_DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ + vpx_codec_ctx_t *, int, typ) VPX_DEPRECATED VPX_UNUSED; \ + \ + VPX_DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ + vpx_codec_ctx_t *ctx, int ctrl_id, typ data) { \ + return vpx_codec_control_(ctx, ctrl_id, data); \ } /**<\hideinitializer*/ /*!\brief vpx_codec_control void type definition macro @@ -447,7 +452,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); */ #define VPX_CTRL_VOID(id) \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int) \ - UNUSED; \ + VPX_UNUSED; \ \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx, \ int ctrl_id) { \ diff --git a/libs/libvpx/vpx/vpx_encoder.h b/libs/libvpx/vpx/vpx_encoder.h index 28fcd5f999..464bc408c8 100644 --- a/libs/libvpx/vpx/vpx_encoder.h +++ b/libs/libvpx/vpx/vpx_encoder.h @@ -63,7 +63,7 @@ extern "C" { * fields to structures */ #define VPX_ENCODER_ABI_VERSION \ - (5 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ + (6 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * @@ -83,10 +83,6 @@ extern "C" { */ #define VPX_CODEC_CAP_OUTPUT_PARTITION 0x20000 -/*! Can support input images at greater than 8 bitdepth. - */ -#define VPX_CODEC_CAP_HIGHBITDEPTH 0x40000 - /*! \brief Initialization-time Feature Enabling * * Certain codec features must be known at initialization time, to allow @@ -158,9 +154,8 @@ enum vpx_codec_cx_pkt_kind { VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ -// Spatial SVC is still experimental and may be removed before the next ABI -// bump. -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) +// Spatial SVC is still experimental and may be removed. 
+#if defined(VPX_TEST_SPATIAL_SVC) VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/ VPX_CODEC_SPATIAL_SVC_LAYER_PSNR, /**< PSNR for each layer in this frame*/ #endif @@ -196,9 +191,8 @@ typedef struct vpx_codec_cx_pkt { double psnr[4]; /**< PSNR, total/y/u/v */ } psnr; /**< data for PSNR packet */ vpx_fixed_buf_t raw; /**< data for arbitrary packets */ -// Spatial SVC is still experimental and may be removed before the next -// ABI bump. -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) +// Spatial SVC is still experimental and may be removed. +#if defined(VPX_TEST_SPATIAL_SVC) size_t layer_sizes[VPX_SS_MAX_LAYERS]; struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS]; #endif @@ -512,25 +506,31 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Rate control adaptation undershoot control * - * This value, expressed as a percentage of the target bitrate, + * VP8: Expressed as a percentage of the target bitrate, * controls the maximum allowed adaptation speed of the codec. * This factor controls the maximum amount of bits that can * be subtracted from the target bitrate in order to compensate * for prior overshoot. - * - * Valid values in the range 0-1000. + * VP9: Expressed as a percentage of the target bitrate, a threshold + * undershoot level (current rate vs target) beyond which more agressive + * corrective measures are taken. + * * + * Valid values in the range VP8:0-1000 VP9: 0-100. */ unsigned int rc_undershoot_pct; /*!\brief Rate control adaptation overshoot control * - * This value, expressed as a percentage of the target bitrate, + * VP8: Expressed as a percentage of the target bitrate, * controls the maximum allowed adaptation speed of the codec. * This factor controls the maximum amount of bits that can * be added to the target bitrate in order to compensate for * prior undershoot. + * VP9: Expressed as a percentage of the target bitrate, a threshold + * overshoot level (current rate vs target) beyond which more agressive + * corrective measures are taken. * - * Valid values in the range 0-1000. + * Valid values in the range VP8:0-1000 VP9: 0-100. */ unsigned int rc_overshoot_pct; @@ -595,6 +595,13 @@ typedef struct vpx_codec_enc_cfg { */ unsigned int rc_2pass_vbr_maxsection_pct; + /*!\brief Two-pass corpus vbr mode complexity control + * Used only in VP9: A value representing the corpus midpoint complexity + * for corpus vbr mode. This value defaults to 0 which disables corpus vbr + * mode in favour of normal vbr mode. 
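 * (Editor's illustration with a hypothetical value: for a two-pass VBR encode,
 * setting
 *   cfg.rc_2pass_vbr_corpus_complexity = 650;
 * before vpx_codec_enc_init() switches rate control to corpus VBR, since any
 * non-zero value enables the mode.)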
+ */ + unsigned int rc_2pass_vbr_corpus_complexity; + /* * keyframing settings (kf) */ diff --git a/libs/libvpx/vpx_dsp/add_noise.c b/libs/libvpx/vpx_dsp/add_noise.c index a2b4c9010f..cda6ae8814 100644 --- a/libs/libvpx/vpx_dsp/add_noise.c +++ b/libs/libvpx/vpx_dsp/add_noise.c @@ -15,6 +15,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/postproc.h" #include "vpx_ports/mem.h" void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, diff --git a/libs/libvpx/vpx_dsp/arm/avg_neon.c b/libs/libvpx/vpx_dsp/arm/avg_neon.c index 001517d33e..fa7dd09600 100644 --- a/libs/libvpx/vpx_dsp/arm/avg_neon.c +++ b/libs/libvpx/vpx_dsp/arm/avg_neon.c @@ -15,62 +15,48 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { - const uint32x4_t a = vpaddlq_u16(v_16x8); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); +uint32_t vpx_avg_4x4_neon(const uint8_t *a, int a_stride) { + const uint8x16_t b = load_unaligned_u8q(a, a_stride); + const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b)); + const uint32x2_t d = horizontal_add_uint16x8(c); + return vget_lane_u32(vrshr_n_u32(d, 4), 0); } -unsigned int vpx_avg_4x4_neon(const uint8_t *s, int p) { - uint16x8_t v_sum; - uint32x2_t v_s0 = vdup_n_u32(0); - uint32x2_t v_s1 = vdup_n_u32(0); - v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0); - v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1); - v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0); - v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1); - v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1)); - return (horizontal_add_u16x8(v_sum) + 8) >> 4; -} +uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) { + int i; + uint8x8_t b, c; + uint16x8_t sum; + uint32x2_t d; + b = vld1_u8(a); + a += a_stride; + c = vld1_u8(a); + a += a_stride; + sum = vaddl_u8(b, c); -unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) { - uint8x8_t v_s0 = vld1_u8(s); - const uint8x8_t v_s1 = vld1_u8(s + p); - uint16x8_t v_sum = vaddl_u8(v_s0, v_s1); + for (i = 0; i < 6; ++i) { + const uint8x8_t d = vld1_u8(a); + a += a_stride; + sum = vaddw_u8(sum, d); + } - v_s0 = vld1_u8(s + 2 * p); - v_sum = vaddw_u8(v_sum, v_s0); + d = horizontal_add_uint16x8(sum); - v_s0 = vld1_u8(s + 3 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 4 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 5 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 6 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 7 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - return (horizontal_add_u16x8(v_sum) + 32) >> 6; + return vget_lane_u32(vrshr_n_u32(d, 6), 0); } // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. 
-int vpx_satd_neon(const int16_t *coeff, int length) { +int vpx_satd_neon(const tran_low_t *coeff, int length) { const int16x4_t zero = vdup_n_s16(0); int32x4_t accum = vdupq_n_s32(0); do { - const int16x8_t src0 = vld1q_s16(coeff); - const int16x8_t src8 = vld1q_s16(coeff + 8); + const int16x8_t src0 = load_tran_low_to_s16q(coeff); + const int16x8_t src8 = load_tran_low_to_s16q(coeff + 8); accum = vabal_s16(accum, vget_low_s16(src0), zero); accum = vabal_s16(accum, vget_high_s16(src0), zero); accum = vabal_s16(accum, vget_low_s16(src8), zero); @@ -153,7 +139,8 @@ int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { ref += 16; } - return horizontal_add_u16x8(vec_sum); + return vget_lane_s16(vreinterpret_s16_u32(horizontal_add_uint16x8(vec_sum)), + 0); } // ref, src = [0, 510] - max diff = 16-bits @@ -183,7 +170,7 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) { { // Note: 'total''s pairwise addition could be implemented similarly to - // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired + // horizontal_add_uint16x8(), but one less vpaddl with 'total' when paired // with the summation of 'sse' performed better on a Cortex-A15. const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total' const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0)); diff --git a/libs/libvpx/vpx_dsp/arm/avg_pred_neon.c b/libs/libvpx/vpx_dsp/arm/avg_pred_neon.c new file mode 100644 index 0000000000..1370ec2d2e --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/avg_pred_neon.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" + +void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + if (width > 8) { + int x, y; + for (y = 0; y < height; ++y) { + for (x = 0; x < width; x += 16) { + const uint8x16_t p = vld1q_u8(pred + x); + const uint8x16_t r = vld1q_u8(ref + x); + const uint8x16_t avg = vrhaddq_u8(p, r); + vst1q_u8(comp + x, avg); + } + comp += width; + pred += width; + ref += ref_stride; + } + } else { + int i; + for (i = 0; i < width * height; i += 16) { + const uint8x16_t p = vld1q_u8(pred); + uint8x16_t r; + + if (width == 4) { + r = load_unaligned_u8q(ref, ref_stride); + ref += 4 * ref_stride; + } else { + const uint8x8_t r_0 = vld1_u8(ref); + const uint8x8_t r_1 = vld1_u8(ref + ref_stride); + assert(width == 8); + r = vcombine_u8(r_0, r_1); + ref += 2 * ref_stride; + } + r = vrhaddq_u8(r, p); + vst1q_u8(comp, r); + + pred += 16; + comp += 16; + } + } +} diff --git a/libs/libvpx/vpx_dsp/arm/deblock_neon.c b/libs/libvpx/vpx_dsp/arm/deblock_neon.c new file mode 100644 index 0000000000..1fb41d2992 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/deblock_neon.c @@ -0,0 +1,485 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/transpose_neon.h" + +extern const int16_t vpx_rv[]; + +static uint8x8_t average_k_out(const uint8x8_t a2, const uint8x8_t a1, + const uint8x8_t v0, const uint8x8_t b1, + const uint8x8_t b2) { + const uint8x8_t k1 = vrhadd_u8(a2, a1); + const uint8x8_t k2 = vrhadd_u8(b2, b1); + const uint8x8_t k3 = vrhadd_u8(k1, k2); + return vrhadd_u8(k3, v0); +} + +static uint8x8_t generate_mask(const uint8x8_t a2, const uint8x8_t a1, + const uint8x8_t v0, const uint8x8_t b1, + const uint8x8_t b2, const uint8x8_t filter) { + const uint8x8_t a2_v0 = vabd_u8(a2, v0); + const uint8x8_t a1_v0 = vabd_u8(a1, v0); + const uint8x8_t b1_v0 = vabd_u8(b1, v0); + const uint8x8_t b2_v0 = vabd_u8(b2, v0); + + uint8x8_t max = vmax_u8(a2_v0, a1_v0); + max = vmax_u8(b1_v0, max); + max = vmax_u8(b2_v0, max); + return vclt_u8(max, filter); +} + +static uint8x8_t generate_output(const uint8x8_t a2, const uint8x8_t a1, + const uint8x8_t v0, const uint8x8_t b1, + const uint8x8_t b2, const uint8x8_t filter) { + const uint8x8_t k_out = average_k_out(a2, a1, v0, b1, b2); + const uint8x8_t mask = generate_mask(a2, a1, v0, b1, b2, filter); + + return vbsl_u8(mask, k_out, v0); +} + +// Same functions but for uint8x16_t. +static uint8x16_t average_k_outq(const uint8x16_t a2, const uint8x16_t a1, + const uint8x16_t v0, const uint8x16_t b1, + const uint8x16_t b2) { + const uint8x16_t k1 = vrhaddq_u8(a2, a1); + const uint8x16_t k2 = vrhaddq_u8(b2, b1); + const uint8x16_t k3 = vrhaddq_u8(k1, k2); + return vrhaddq_u8(k3, v0); +} + +static uint8x16_t generate_maskq(const uint8x16_t a2, const uint8x16_t a1, + const uint8x16_t v0, const uint8x16_t b1, + const uint8x16_t b2, const uint8x16_t filter) { + const uint8x16_t a2_v0 = vabdq_u8(a2, v0); + const uint8x16_t a1_v0 = vabdq_u8(a1, v0); + const uint8x16_t b1_v0 = vabdq_u8(b1, v0); + const uint8x16_t b2_v0 = vabdq_u8(b2, v0); + + uint8x16_t max = vmaxq_u8(a2_v0, a1_v0); + max = vmaxq_u8(b1_v0, max); + max = vmaxq_u8(b2_v0, max); + return vcltq_u8(max, filter); +} + +static uint8x16_t generate_outputq(const uint8x16_t a2, const uint8x16_t a1, + const uint8x16_t v0, const uint8x16_t b1, + const uint8x16_t b2, + const uint8x16_t filter) { + const uint8x16_t k_out = average_k_outq(a2, a1, v0, b1, b2); + const uint8x16_t mask = generate_maskq(a2, a1, v0, b1, b2, filter); + + return vbslq_u8(mask, k_out, v0); +} + +void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int dst_stride, int cols, + uint8_t *f, int size) { + uint8_t *src, *dst; + int row; + int col; + + // Process a stripe of macroblocks. The stripe will be a multiple of 16 (for + // Y) or 8 (for U/V) wide (cols) and the height (size) will be 16 (for Y) or 8 + // (for U/V). + assert((size == 8 || size == 16) && cols % 8 == 0); + + // While columns of length 16 can be processed, load them. 
+ for (col = 0; col < cols - 8; col += 16) { + uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7; + src = src_ptr - 2 * src_stride; + dst = dst_ptr; + + a0 = vld1q_u8(src); + src += src_stride; + a1 = vld1q_u8(src); + src += src_stride; + a2 = vld1q_u8(src); + src += src_stride; + a3 = vld1q_u8(src); + src += src_stride; + + for (row = 0; row < size; row += 4) { + uint8x16_t v_out_0, v_out_1, v_out_2, v_out_3; + const uint8x16_t filterq = vld1q_u8(f + col); + + a4 = vld1q_u8(src); + src += src_stride; + a5 = vld1q_u8(src); + src += src_stride; + a6 = vld1q_u8(src); + src += src_stride; + a7 = vld1q_u8(src); + src += src_stride; + + v_out_0 = generate_outputq(a0, a1, a2, a3, a4, filterq); + v_out_1 = generate_outputq(a1, a2, a3, a4, a5, filterq); + v_out_2 = generate_outputq(a2, a3, a4, a5, a6, filterq); + v_out_3 = generate_outputq(a3, a4, a5, a6, a7, filterq); + + vst1q_u8(dst, v_out_0); + dst += dst_stride; + vst1q_u8(dst, v_out_1); + dst += dst_stride; + vst1q_u8(dst, v_out_2); + dst += dst_stride; + vst1q_u8(dst, v_out_3); + dst += dst_stride; + + // Rotate over to the next slot. + a0 = a4; + a1 = a5; + a2 = a6; + a3 = a7; + } + + src_ptr += 16; + dst_ptr += 16; + } + + // Clean up any left over column of length 8. + if (col != cols) { + uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7; + src = src_ptr - 2 * src_stride; + dst = dst_ptr; + + a0 = vld1_u8(src); + src += src_stride; + a1 = vld1_u8(src); + src += src_stride; + a2 = vld1_u8(src); + src += src_stride; + a3 = vld1_u8(src); + src += src_stride; + + for (row = 0; row < size; row += 4) { + uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3; + const uint8x8_t filter = vld1_u8(f + col); + + a4 = vld1_u8(src); + src += src_stride; + a5 = vld1_u8(src); + src += src_stride; + a6 = vld1_u8(src); + src += src_stride; + a7 = vld1_u8(src); + src += src_stride; + + v_out_0 = generate_output(a0, a1, a2, a3, a4, filter); + v_out_1 = generate_output(a1, a2, a3, a4, a5, filter); + v_out_2 = generate_output(a2, a3, a4, a5, a6, filter); + v_out_3 = generate_output(a3, a4, a5, a6, a7, filter); + + vst1_u8(dst, v_out_0); + dst += dst_stride; + vst1_u8(dst, v_out_1); + dst += dst_stride; + vst1_u8(dst, v_out_2); + dst += dst_stride; + vst1_u8(dst, v_out_3); + dst += dst_stride; + + // Rotate over to the next slot. + a0 = a4; + a1 = a5; + a2 = a6; + a3 = a7; + } + + // Not strictly necessary but makes resetting dst_ptr easier. + dst_ptr += 8; + } + + dst_ptr -= cols; + + for (row = 0; row < size; row += 8) { + uint8x8_t a0, a1, a2, a3; + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7; + + src = dst_ptr; + dst = dst_ptr; + + // Load 8 values, transpose 4 of them, and discard 2 because they will be + // reloaded later. + load_and_transpose_u8_4x8(src, dst_stride, &a0, &a1, &a2, &a3); + a3 = a1; + a2 = a1 = a0; // Extend left border. + + src += 2; + + for (col = 0; col < cols; col += 8) { + uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3, v_out_4, v_out_5, v_out_6, + v_out_7; + // Although the filter is meant to be applied vertically and is instead + // being applied horizontally here it's OK because it's set in blocks of 8 + // (or 16). + const uint8x8_t filter = vld1_u8(f + col); + + load_and_transpose_u8_8x8(src, dst_stride, &b0, &b1, &b2, &b3, &b4, &b5, + &b6, &b7); + + if (col + 8 == cols) { + // Last row. Extend border (b5). 
+ b6 = b7 = b5; + } + + v_out_0 = generate_output(a0, a1, a2, a3, b0, filter); + v_out_1 = generate_output(a1, a2, a3, b0, b1, filter); + v_out_2 = generate_output(a2, a3, b0, b1, b2, filter); + v_out_3 = generate_output(a3, b0, b1, b2, b3, filter); + v_out_4 = generate_output(b0, b1, b2, b3, b4, filter); + v_out_5 = generate_output(b1, b2, b3, b4, b5, filter); + v_out_6 = generate_output(b2, b3, b4, b5, b6, filter); + v_out_7 = generate_output(b3, b4, b5, b6, b7, filter); + + transpose_and_store_u8_8x8(dst, dst_stride, v_out_0, v_out_1, v_out_2, + v_out_3, v_out_4, v_out_5, v_out_6, v_out_7); + + a0 = b4; + a1 = b5; + a2 = b6; + a3 = b7; + + src += 8; + dst += 8; + } + + dst_ptr += 8 * dst_stride; + } +} + +// sum += x; +// sumsq += x * y; +static void accumulate_sum_sumsq(const int16x4_t x, const int32x4_t xy, + int16x4_t *const sum, int32x4_t *const sumsq) { + const int16x4_t zero = vdup_n_s16(0); + const int32x4_t zeroq = vdupq_n_s32(0); + + // Add in the first set because vext doesn't work with '0'. + *sum = vadd_s16(*sum, x); + *sumsq = vaddq_s32(*sumsq, xy); + + // Shift x and xy to the right and sum. vext requires an immediate. + *sum = vadd_s16(*sum, vext_s16(zero, x, 1)); + *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 1)); + + *sum = vadd_s16(*sum, vext_s16(zero, x, 2)); + *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 2)); + + *sum = vadd_s16(*sum, vext_s16(zero, x, 3)); + *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 3)); +} + +// Generate mask based on (sumsq * 15 - sum * sum < flimit) +static uint16x4_t calculate_mask(const int16x4_t sum, const int32x4_t sumsq, + const int32x4_t f, const int32x4_t fifteen) { + const int32x4_t a = vmulq_s32(sumsq, fifteen); + const int32x4_t b = vmlsl_s16(a, sum, sum); + const uint32x4_t mask32 = vcltq_s32(b, f); + return vmovn_u32(mask32); +} + +static uint8x8_t combine_mask(const int16x4_t sum_low, const int16x4_t sum_high, + const int32x4_t sumsq_low, + const int32x4_t sumsq_high, const int32x4_t f) { + const int32x4_t fifteen = vdupq_n_s32(15); + const uint16x4_t mask16_low = calculate_mask(sum_low, sumsq_low, f, fifteen); + const uint16x4_t mask16_high = + calculate_mask(sum_high, sumsq_high, f, fifteen); + return vmovn_u16(vcombine_u16(mask16_low, mask16_high)); +} + +// Apply filter of (8 + sum + s[c]) >> 4. +static uint8x8_t filter_pixels(const int16x8_t sum, const uint8x8_t s) { + const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s)); + const int16x8_t sum_s = vaddq_s16(sum, s16); + + return vqrshrun_n_s16(sum_s, 4); +} + +void vpx_mbpost_proc_across_ip_neon(uint8_t *src, int pitch, int rows, int cols, + int flimit) { + int row, col; + const int32x4_t f = vdupq_n_s32(flimit); + + assert(cols % 8 == 0); + + for (row = 0; row < rows; ++row) { + // Sum the first 8 elements, which are extended from s[0]. + // sumsq gets primed with +16. + int sumsq = src[0] * src[0] * 9 + 16; + int sum = src[0] * 9; + + uint8x8_t left_context, s, right_context; + int16x4_t sum_low, sum_high; + int32x4_t sumsq_low, sumsq_high; + + // Sum (+square) the next 6 elements. + // Skip [0] because it's included above. + for (col = 1; col <= 6; ++col) { + sumsq += src[col] * src[col]; + sum += src[col]; + } + + // Prime the sums. Later the loop uses the _high values to prime the new + // vectors. + sumsq_high = vdupq_n_s32(sumsq); + sum_high = vdup_n_s16(sum); + + // Manually extend the left border. 
+ left_context = vdup_n_u8(src[0]); + + for (col = 0; col < cols; col += 8) { + uint8x8_t mask, output; + int16x8_t x, y; + int32x4_t xy_low, xy_high; + + s = vld1_u8(src + col); + + if (col + 8 == cols) { + // Last row. Extend border. + right_context = vdup_n_u8(src[col + 7]); + } else { + right_context = vld1_u8(src + col + 7); + } + + x = vreinterpretq_s16_u16(vsubl_u8(right_context, left_context)); + y = vreinterpretq_s16_u16(vaddl_u8(right_context, left_context)); + xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y)); + xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y)); + + // Catch up to the last sum'd value. + sum_low = vdup_lane_s16(sum_high, 3); + sumsq_low = vdupq_lane_s32(vget_high_s32(sumsq_high), 1); + + accumulate_sum_sumsq(vget_low_s16(x), xy_low, &sum_low, &sumsq_low); + + // Need to do this sequentially because we need the max value from + // sum_low. + sum_high = vdup_lane_s16(sum_low, 3); + sumsq_high = vdupq_lane_s32(vget_high_s32(sumsq_low), 1); + + accumulate_sum_sumsq(vget_high_s16(x), xy_high, &sum_high, &sumsq_high); + + mask = combine_mask(sum_low, sum_high, sumsq_low, sumsq_high, f); + + output = filter_pixels(vcombine_s16(sum_low, sum_high), s); + output = vbsl_u8(mask, output, s); + + vst1_u8(src + col, output); + + left_context = s; + } + + src += pitch; + } +} + +// Apply filter of (vpx_rv + sum + s[c]) >> 4. +static uint8x8_t filter_pixels_rv(const int16x8_t sum, const uint8x8_t s, + const int16x8_t rv) { + const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s)); + const int16x8_t sum_s = vaddq_s16(sum, s16); + const int16x8_t rounded = vaddq_s16(sum_s, rv); + + return vqshrun_n_s16(rounded, 4); +} + +void vpx_mbpost_proc_down_neon(uint8_t *dst, int pitch, int rows, int cols, + int flimit) { + int row, col, i; + const int32x4_t f = vdupq_n_s32(flimit); + uint8x8_t below_context = vdup_n_u8(0); + + // 8 columns are processed at a time. + // If rows is less than 8 the bottom border extension fails. + assert(cols % 8 == 0); + assert(rows >= 8); + + // Load and keep the first 8 values in memory. Process a vertical stripe that + // is 8 wide. + for (col = 0; col < cols; col += 8) { + uint8x8_t s, above_context[8]; + int16x8_t sum, sum_tmp; + int32x4_t sumsq_low, sumsq_high; + + // Load and extend the top border. + s = vld1_u8(dst); + for (i = 0; i < 8; i++) { + above_context[i] = s; + } + + sum_tmp = vreinterpretq_s16_u16(vmovl_u8(s)); + + // sum * 9 + sum = vmulq_n_s16(sum_tmp, 9); + + // (sum * 9) * sum == sum * sum * 9 + sumsq_low = vmull_s16(vget_low_s16(sum), vget_low_s16(sum_tmp)); + sumsq_high = vmull_s16(vget_high_s16(sum), vget_high_s16(sum_tmp)); + + // Load and discard the next 6 values to prime sum and sumsq. + for (i = 1; i <= 6; ++i) { + const uint8x8_t a = vld1_u8(dst + i * pitch); + const int16x8_t b = vreinterpretq_s16_u16(vmovl_u8(a)); + sum = vaddq_s16(sum, b); + + sumsq_low = vmlal_s16(sumsq_low, vget_low_s16(b), vget_low_s16(b)); + sumsq_high = vmlal_s16(sumsq_high, vget_high_s16(b), vget_high_s16(b)); + } + + for (row = 0; row < rows; ++row) { + uint8x8_t mask, output; + int16x8_t x, y; + int32x4_t xy_low, xy_high; + + s = vld1_u8(dst + row * pitch); + + // Extend the bottom border. 
+ if (row + 7 < rows) { + below_context = vld1_u8(dst + (row + 7) * pitch); + } + + x = vreinterpretq_s16_u16(vsubl_u8(below_context, above_context[0])); + y = vreinterpretq_s16_u16(vaddl_u8(below_context, above_context[0])); + xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y)); + xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y)); + + sum = vaddq_s16(sum, x); + + sumsq_low = vaddq_s32(sumsq_low, xy_low); + sumsq_high = vaddq_s32(sumsq_high, xy_high); + + mask = combine_mask(vget_low_s16(sum), vget_high_s16(sum), sumsq_low, + sumsq_high, f); + + output = filter_pixels_rv(sum, s, vld1q_s16(vpx_rv + (row & 127))); + output = vbsl_u8(mask, output, s); + + vst1_u8(dst + row * pitch, output); + + above_context[0] = above_context[1]; + above_context[1] = above_context[2]; + above_context[2] = above_context[3]; + above_context[3] = above_context[4]; + above_context[4] = above_context[5]; + above_context[5] = above_context[6]; + above_context[6] = above_context[7]; + above_context[7] = s; + } + + dst += 8; + } +} diff --git a/libs/libvpx/vpx_dsp/arm/fdct16x16_neon.c b/libs/libvpx/vpx_dsp/arm/fdct16x16_neon.c new file mode 100644 index 0000000000..6b2bebd097 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/fdct16x16_neon.c @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +// Some builds of gcc 4.9.2 and .3 have trouble with some of the inline +// functions. +#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \ + __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4 + +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { + vpx_fdct16x16_c(input, output, stride); +} + +#else + +static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) { + b[0] = vld1q_s16(a); + a += stride; + b[1] = vld1q_s16(a); + a += stride; + b[2] = vld1q_s16(a); + a += stride; + b[3] = vld1q_s16(a); + a += stride; + b[4] = vld1q_s16(a); + a += stride; + b[5] = vld1q_s16(a); + a += stride; + b[6] = vld1q_s16(a); + a += stride; + b[7] = vld1q_s16(a); + a += stride; + b[8] = vld1q_s16(a); + a += stride; + b[9] = vld1q_s16(a); + a += stride; + b[10] = vld1q_s16(a); + a += stride; + b[11] = vld1q_s16(a); + a += stride; + b[12] = vld1q_s16(a); + a += stride; + b[13] = vld1q_s16(a); + a += stride; + b[14] = vld1q_s16(a); + a += stride; + b[15] = vld1q_s16(a); +} + +// Store 8 16x8 values, assuming stride == 16. +static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) { + store_s16q_to_tran_low(a, b[0]); + a += 16; + store_s16q_to_tran_low(a, b[1]); + a += 16; + store_s16q_to_tran_low(a, b[2]); + a += 16; + store_s16q_to_tran_low(a, b[3]); + a += 16; + store_s16q_to_tran_low(a, b[4]); + a += 16; + store_s16q_to_tran_low(a, b[5]); + a += 16; + store_s16q_to_tran_low(a, b[6]); + a += 16; + store_s16q_to_tran_low(a, b[7]); +} + +// Load step of each pass. Add and subtract clear across the input, requiring +// all 16 values to be loaded. For the first pass it also multiplies by 4. 
+ +// To maybe reduce register usage this could be combined with the load() step to +// get the first 4 and last 4 values, cross those, then load the middle 8 values +// and cross them. +static INLINE void cross_input(const int16x8_t *a /*[16]*/, + int16x8_t *b /*[16]*/, const int pass) { + if (pass == 0) { + b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2); + b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2); + b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2); + b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2); + b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2); + b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2); + b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2); + b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2); + + b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2); + b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2); + b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2); + b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2); + b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2); + b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2); + b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2); + b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2); + } else { + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); + } +} + +// Quarter round at the beginning of the second pass. Can't use vrshr (rounding) +// because this only adds 1, not 1 << 2. +static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { + const int16x8_t one = vdupq_n_s16(1); + a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2); + a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2); + a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2); + a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2); + a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2); + a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2); + a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2); + a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2); + a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2); + a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2); + a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2); + a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2); + a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2); + a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2); + a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2); + a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2); +} + +// fdct_round_shift((a +/- b) * c) +static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, + const tran_high_t c, int16x8_t *add, + int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// fdct_round_shift(a * c0 +/- b * c1) +static INLINE void 
butterfly_two_coeff(const int16x8_t a, const int16x8_t b, + const tran_coef_t c0, + const tran_coef_t c1, int16x8_t *add, + int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0); + const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1); + const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1); + const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0); + const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// Transpose 8x8 to a new location. Don't use transpose_neon.h because those +// are all in-place. +static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/, + int16x8_t *b /*[8]*/) { + // Swap 16 bit elements. + const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements. + const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); + const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), + vreinterpretq_s32_s16(c3.val[0])); + const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), + vreinterpretq_s32_s16(c3.val[1])); + + // Swap 64 bit elements + const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); + const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); + const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); + const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); + + b[0] = e0.val[0]; + b[1] = e1.val[0]; + b[2] = e2.val[0]; + b[3] = e3.val[0]; + b[4] = e0.val[1]; + b[5] = e1.val[1]; + b[6] = e2.val[1]; + b[7] = e3.val[1]; +} + +// Main body of fdct16x16. 
+static void dct_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) { + int16x8_t s[8]; + int16x8_t x[4]; + int16x8_t step[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]); + + // Stage 3 + x[0] = vaddq_s16(s[4], s[5]); + x[1] = vsubq_s16(s[4], s[5]); + x[2] = vsubq_s16(s[7], s[6]); + x[3] = vaddq_s16(s[7], s[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]); + // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. 
+ + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]); + butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]); + + // step 3 + s[0] = vaddq_s16(in[8], s[3]); + s[1] = vaddq_s16(in[9], s[2]); + x[0] = vsubq_s16(in[9], s[2]); + x[1] = vsubq_s16(in[8], s[3]); + x[2] = vsubq_s16(in[15], s[4]); + x[3] = vsubq_s16(in[14], s[5]); + s[6] = vaddq_s16(in[14], s[5]); + s[7] = vaddq_s16(in[15], s[4]); + + // step 4 + // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64) + // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64) + butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]); + + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64) + butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]); + + // step 5 + step[0] = vaddq_s16(s[0], s[1]); + step[1] = vsubq_s16(s[0], s[1]); + step[2] = vaddq_s16(x[1], s[2]); + step[3] = vsubq_s16(x[1], s[2]); + step[4] = vsubq_s16(x[2], s[5]); + step[5] = vaddq_s16(x[2], s[5]); + step[6] = vsubq_s16(s[7], s[6]); + step[7] = vaddq_s16(s[7], s[6]); + + // step 6 + // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64) + // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64) + // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64) + // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64) + // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64) + // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] * + // cospi_22_64) + // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64) + // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64) + butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9], + &out[7]); + butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1], + &out[15]); + butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13], + &out[3]); + butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5], + &out[11]); +} + +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { + int16x8_t temp0[16]; + int16x8_t temp1[16]; + int16x8_t temp2[16]; + int16x8_t temp3[16]; + + // Left half. + load(input, stride, temp0); + cross_input(temp0, temp1, 0); + dct_body(temp1, temp0); + + // Right half. + load(input + 8, stride, temp1); + cross_input(temp1, temp2, 0); + dct_body(temp2, temp1); + + // Transpose top left and top right quarters into one contiguous location to + // process to the top half. 
+ transpose_8x8(&temp0[0], &temp2[0]); + transpose_8x8(&temp1[0], &temp2[8]); + partial_round_shift(temp2); + cross_input(temp2, temp3, 1); + dct_body(temp3, temp2); + transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], + &temp2[5], &temp2[6], &temp2[7]); + transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], + &temp2[13], &temp2[14], &temp2[15]); + store(output, temp2); + store(output + 8, temp2 + 8); + output += 8 * 16; + + // Transpose bottom left and bottom right quarters into one contiguous + // location to process to the bottom half. + transpose_8x8(&temp0[8], &temp1[0]); + transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], + &temp1[13], &temp1[14], &temp1[15]); + partial_round_shift(temp1); + cross_input(temp1, temp0, 1); + dct_body(temp0, temp1); + transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4], + &temp1[5], &temp1[6], &temp1[7]); + transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], + &temp1[13], &temp1[14], &temp1[15]); + store(output, temp1); + store(output + 8, temp1 + 8); +} +#endif  // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && + // __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4 diff --git a/libs/libvpx/vpx_dsp/arm/fdct32x32_neon.c b/libs/libvpx/vpx_dsp/arm/fdct32x32_neon.c new file mode 100644 index 0000000000..e9cd34904b --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/fdct32x32_neon.c @@ -0,0 +1,1507 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +// Most gcc 4.9 distributions outside of Android do not generate correct code +// for this function. +#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \ + __GNUC__ == 4 && __GNUC_MINOR__ <= 9 + +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { + vpx_fdct32x32_c(input, output, stride); +} + +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, + int stride) { + vpx_fdct32x32_rd_c(input, output, stride); +} + +#else + +#define LOAD_INCREMENT(src, stride, dest, index) \ + do { \ + dest[index] = vld1q_s16(src); \ + src += stride; \ + } while (0) + +#define ADD_S16(src, index0, index1, dest, index3) \ + do { \ + dest[index3] = vaddq_s16(src[index0], src[index1]); \ + } while (0) + +#define ADD_SHIFT_S16(src, index0, index1) \ + do { \ + src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \ + } while (0) + +// Load, cross, and multiply by 4. Load the first 8 and last 8, then the +// middle +// 16. Doing sets of 16 at a time. Maybe sets of 8 would be better? 
+static INLINE void load(const int16_t *a, int stride, int16x8_t *b) { + const int16_t *a_end = a + 24 * stride; + int16x8_t c[8]; + + LOAD_INCREMENT(a, stride, b, 0); + LOAD_INCREMENT(a, stride, b, 1); + LOAD_INCREMENT(a, stride, b, 2); + LOAD_INCREMENT(a, stride, b, 3); + LOAD_INCREMENT(a, stride, b, 4); + LOAD_INCREMENT(a, stride, b, 5); + LOAD_INCREMENT(a, stride, b, 6); + LOAD_INCREMENT(a, stride, b, 7); + + LOAD_INCREMENT(a_end, stride, b, 24); + LOAD_INCREMENT(a_end, stride, b, 25); + LOAD_INCREMENT(a_end, stride, b, 26); + LOAD_INCREMENT(a_end, stride, b, 27); + LOAD_INCREMENT(a_end, stride, b, 28); + LOAD_INCREMENT(a_end, stride, b, 29); + LOAD_INCREMENT(a_end, stride, b, 30); + LOAD_INCREMENT(a_end, stride, b, 31); + + ADD_S16(b, 0, 31, c, 0); + ADD_S16(b, 1, 30, c, 1); + ADD_S16(b, 2, 29, c, 2); + ADD_S16(b, 3, 28, c, 3); + ADD_S16(b, 4, 27, c, 4); + ADD_S16(b, 5, 26, c, 5); + ADD_S16(b, 6, 25, c, 6); + ADD_S16(b, 7, 24, c, 7); + + ADD_SHIFT_S16(b, 7, 24); + ADD_SHIFT_S16(b, 6, 25); + ADD_SHIFT_S16(b, 5, 26); + ADD_SHIFT_S16(b, 4, 27); + ADD_SHIFT_S16(b, 3, 28); + ADD_SHIFT_S16(b, 2, 29); + ADD_SHIFT_S16(b, 1, 30); + ADD_SHIFT_S16(b, 0, 31); + + b[0] = vshlq_n_s16(c[0], 2); + b[1] = vshlq_n_s16(c[1], 2); + b[2] = vshlq_n_s16(c[2], 2); + b[3] = vshlq_n_s16(c[3], 2); + b[4] = vshlq_n_s16(c[4], 2); + b[5] = vshlq_n_s16(c[5], 2); + b[6] = vshlq_n_s16(c[6], 2); + b[7] = vshlq_n_s16(c[7], 2); + + LOAD_INCREMENT(a, stride, b, 8); + LOAD_INCREMENT(a, stride, b, 9); + LOAD_INCREMENT(a, stride, b, 10); + LOAD_INCREMENT(a, stride, b, 11); + LOAD_INCREMENT(a, stride, b, 12); + LOAD_INCREMENT(a, stride, b, 13); + LOAD_INCREMENT(a, stride, b, 14); + LOAD_INCREMENT(a, stride, b, 15); + LOAD_INCREMENT(a, stride, b, 16); + LOAD_INCREMENT(a, stride, b, 17); + LOAD_INCREMENT(a, stride, b, 18); + LOAD_INCREMENT(a, stride, b, 19); + LOAD_INCREMENT(a, stride, b, 20); + LOAD_INCREMENT(a, stride, b, 21); + LOAD_INCREMENT(a, stride, b, 22); + LOAD_INCREMENT(a, stride, b, 23); + + ADD_S16(b, 8, 23, c, 0); + ADD_S16(b, 9, 22, c, 1); + ADD_S16(b, 10, 21, c, 2); + ADD_S16(b, 11, 20, c, 3); + ADD_S16(b, 12, 19, c, 4); + ADD_S16(b, 13, 18, c, 5); + ADD_S16(b, 14, 17, c, 6); + ADD_S16(b, 15, 16, c, 7); + + ADD_SHIFT_S16(b, 15, 16); + ADD_SHIFT_S16(b, 14, 17); + ADD_SHIFT_S16(b, 13, 18); + ADD_SHIFT_S16(b, 12, 19); + ADD_SHIFT_S16(b, 11, 20); + ADD_SHIFT_S16(b, 10, 21); + ADD_SHIFT_S16(b, 9, 22); + ADD_SHIFT_S16(b, 8, 23); + + b[8] = vshlq_n_s16(c[0], 2); + b[9] = vshlq_n_s16(c[1], 2); + b[10] = vshlq_n_s16(c[2], 2); + b[11] = vshlq_n_s16(c[3], 2); + b[12] = vshlq_n_s16(c[4], 2); + b[13] = vshlq_n_s16(c[5], 2); + b[14] = vshlq_n_s16(c[6], 2); + b[15] = vshlq_n_s16(c[7], 2); +} + +#undef LOAD_INCREMENT +#undef ADD_S16 +#undef ADD_SHIFT_S16 + +#define STORE_S16(src, index, dest) \ + do { \ + store_s16q_to_tran_low(dest, src[index]); \ + dest += 8; \ + } while (0); + +// Store 32 16x8 values, assuming stride == 32. +// Slight twist: store horizontally in blocks of 8. 
+static INLINE void store(tran_low_t *a, const int16x8_t *b) { + STORE_S16(b, 0, a); + STORE_S16(b, 8, a); + STORE_S16(b, 16, a); + STORE_S16(b, 24, a); + STORE_S16(b, 1, a); + STORE_S16(b, 9, a); + STORE_S16(b, 17, a); + STORE_S16(b, 25, a); + STORE_S16(b, 2, a); + STORE_S16(b, 10, a); + STORE_S16(b, 18, a); + STORE_S16(b, 26, a); + STORE_S16(b, 3, a); + STORE_S16(b, 11, a); + STORE_S16(b, 19, a); + STORE_S16(b, 27, a); + STORE_S16(b, 4, a); + STORE_S16(b, 12, a); + STORE_S16(b, 20, a); + STORE_S16(b, 28, a); + STORE_S16(b, 5, a); + STORE_S16(b, 13, a); + STORE_S16(b, 21, a); + STORE_S16(b, 29, a); + STORE_S16(b, 6, a); + STORE_S16(b, 14, a); + STORE_S16(b, 22, a); + STORE_S16(b, 30, a); + STORE_S16(b, 7, a); + STORE_S16(b, 15, a); + STORE_S16(b, 23, a); + STORE_S16(b, 31, a); +} + +#undef STORE_S16 + +// fdct_round_shift((a +/- b) * c) +static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, + const tran_high_t constant, + int16x8_t *add, int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// fdct_round_shift(a * c0 +/- b * c1) +static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, + const tran_coef_t constant0, + const tran_coef_t constant1, + int16x8_t *add, int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0); + const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1); + const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1); + const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0); + const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// Add 2 if positive, 1 if negative, and shift by 2. +// In practice, subtract the sign bit, then shift with rounding. +static INLINE int16x8_t sub_round_shift(const int16x8_t a) { + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2); +} + +static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. 
+ a[0] = vaddq_s16(in[0], in[15]); + a[1] = vaddq_s16(in[1], in[14]); + a[2] = vaddq_s16(in[2], in[13]); + a[3] = vaddq_s16(in[3], in[12]); + a[4] = vaddq_s16(in[4], in[11]); + a[5] = vaddq_s16(in[5], in[10]); + a[6] = vaddq_s16(in[6], in[9]); + a[7] = vaddq_s16(in[7], in[8]); + + a[8] = vsubq_s16(in[7], in[8]); + a[9] = vsubq_s16(in[6], in[9]); + a[10] = vsubq_s16(in[5], in[10]); + a[11] = vsubq_s16(in[4], in[11]); + a[12] = vsubq_s16(in[3], in[12]); + a[13] = vsubq_s16(in[2], in[13]); + a[14] = vsubq_s16(in[1], in[14]); + a[15] = vsubq_s16(in[0], in[15]); + + a[16] = in[16]; + a[17] = in[17]; + a[18] = in[18]; + a[19] = in[19]; + + butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]); + butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]); + butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]); + butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]); + + a[28] = in[28]; + a[29] = in[29]; + a[30] = in[30]; + a[31] = in[31]; + + // Stage 3. + b[0] = vaddq_s16(a[0], a[7]); + b[1] = vaddq_s16(a[1], a[6]); + b[2] = vaddq_s16(a[2], a[5]); + b[3] = vaddq_s16(a[3], a[4]); + + b[4] = vsubq_s16(a[3], a[4]); + b[5] = vsubq_s16(a[2], a[5]); + b[6] = vsubq_s16(a[1], a[6]); + b[7] = vsubq_s16(a[0], a[7]); + + b[8] = a[8]; + b[9] = a[9]; + + butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]); + butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]); + + b[14] = a[14]; + b[15] = a[15]; + + b[16] = vaddq_s16(in[16], a[23]); + b[17] = vaddq_s16(in[17], a[22]); + b[18] = vaddq_s16(in[18], a[21]); + b[19] = vaddq_s16(in[19], a[20]); + + b[20] = vsubq_s16(in[19], a[20]); + b[21] = vsubq_s16(in[18], a[21]); + b[22] = vsubq_s16(in[17], a[22]); + b[23] = vsubq_s16(in[16], a[23]); + + b[24] = vsubq_s16(in[31], a[24]); + b[25] = vsubq_s16(in[30], a[25]); + b[26] = vsubq_s16(in[29], a[26]); + b[27] = vsubq_s16(in[28], a[27]); + + b[28] = vaddq_s16(in[28], a[27]); + b[29] = vaddq_s16(in[29], a[26]); + b[30] = vaddq_s16(in[30], a[25]); + b[31] = vaddq_s16(in[31], a[24]); + + // Stage 4. + a[0] = vaddq_s16(b[0], b[3]); + a[1] = vaddq_s16(b[1], b[2]); + a[2] = vsubq_s16(b[1], b[2]); + a[3] = vsubq_s16(b[0], b[3]); + + a[4] = b[4]; + + butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]); + + a[7] = b[7]; + + a[8] = vaddq_s16(b[8], b[11]); + a[9] = vaddq_s16(b[9], b[10]); + a[10] = vsubq_s16(b[9], b[10]); + a[11] = vsubq_s16(b[8], b[11]); + a[12] = vsubq_s16(b[15], b[12]); + a[13] = vsubq_s16(b[14], b[13]); + a[14] = vaddq_s16(b[14], b[13]); + a[15] = vaddq_s16(b[15], b[12]); + + a[16] = b[16]; + a[17] = b[17]; + + butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]); + butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]); + butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]); + butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]); + + a[22] = b[22]; + a[23] = b[23]; + a[24] = b[24]; + a[25] = b[25]; + + a[30] = b[30]; + a[31] = b[31]; + + // Stage 5. 
+ butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]); + butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]); + + b[4] = vaddq_s16(a[4], a[5]); + b[5] = vsubq_s16(a[4], a[5]); + b[6] = vsubq_s16(a[7], a[6]); + b[7] = vaddq_s16(a[7], a[6]); + + b[8] = a[8]; + + butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]); + butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]); + + b[11] = a[11]; + b[12] = a[12]; + + b[15] = a[15]; + + b[16] = vaddq_s16(a[19], a[16]); + b[17] = vaddq_s16(a[18], a[17]); + b[18] = vsubq_s16(a[17], a[18]); + b[19] = vsubq_s16(a[16], a[19]); + b[20] = vsubq_s16(a[23], a[20]); + b[21] = vsubq_s16(a[22], a[21]); + b[22] = vaddq_s16(a[21], a[22]); + b[23] = vaddq_s16(a[20], a[23]); + b[24] = vaddq_s16(a[27], a[24]); + b[25] = vaddq_s16(a[26], a[25]); + b[26] = vsubq_s16(a[25], a[26]); + b[27] = vsubq_s16(a[24], a[27]); + b[28] = vsubq_s16(a[31], a[28]); + b[29] = vsubq_s16(a[30], a[29]); + b[30] = vaddq_s16(a[29], a[30]); + b[31] = vaddq_s16(a[28], a[31]); + + // Stage 6. + a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + + butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]); + butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]); + + a[8] = vaddq_s16(b[8], b[9]); + a[9] = vsubq_s16(b[8], b[9]); + a[10] = vsubq_s16(b[11], b[10]); + a[11] = vaddq_s16(b[11], b[10]); + a[12] = vaddq_s16(b[12], b[13]); + a[13] = vsubq_s16(b[12], b[13]); + a[14] = vsubq_s16(b[15], b[14]); + a[15] = vaddq_s16(b[15], b[14]); + + a[16] = b[16]; + a[19] = b[19]; + a[20] = b[20]; + a[23] = b[23]; + a[24] = b[24]; + a[27] = b[27]; + a[28] = b[28]; + a[31] = b[31]; + + butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]); + butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]); + + butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]); + butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]); + + // Stage 7. + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + b[4] = a[4]; + b[5] = a[5]; + b[6] = a[6]; + b[7] = a[7]; + + butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]); + butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]); + butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]); + butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]); + + b[16] = vaddq_s16(a[16], a[17]); + b[17] = vsubq_s16(a[16], a[17]); + b[18] = vsubq_s16(a[19], a[18]); + b[19] = vaddq_s16(a[19], a[18]); + b[20] = vaddq_s16(a[20], a[21]); + b[21] = vsubq_s16(a[20], a[21]); + b[22] = vsubq_s16(a[23], a[22]); + b[23] = vaddq_s16(a[23], a[22]); + b[24] = vaddq_s16(a[24], a[25]); + b[25] = vsubq_s16(a[24], a[25]); + b[26] = vsubq_s16(a[27], a[26]); + b[27] = vaddq_s16(a[27], a[26]); + b[28] = vaddq_s16(a[28], a[29]); + b[29] = vsubq_s16(a[28], a[29]); + b[30] = vsubq_s16(a[31], a[30]); + b[31] = vaddq_s16(a[31], a[30]); + + // Final stage. 
+ // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[0] = sub_round_shift(b[0]); + out[16] = sub_round_shift(b[1]); + out[8] = sub_round_shift(b[2]); + out[24] = sub_round_shift(b[3]); + out[4] = sub_round_shift(b[4]); + out[20] = sub_round_shift(b[5]); + out[12] = sub_round_shift(b[6]); + out[28] = sub_round_shift(b[7]); + out[2] = sub_round_shift(b[8]); + out[18] = sub_round_shift(b[9]); + out[10] = sub_round_shift(b[10]); + out[26] = sub_round_shift(b[11]); + out[6] = sub_round_shift(b[12]); + out[22] = sub_round_shift(b[13]); + out[14] = sub_round_shift(b[14]); + out[30] = sub_round_shift(b[15]); + + butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]); + out[1] = sub_round_shift(a[1]); + out[31] = sub_round_shift(a[31]); + + butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]); + out[17] = sub_round_shift(a[17]); + out[15] = sub_round_shift(a[15]); + + butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]); + out[9] = sub_round_shift(a[9]); + out[23] = sub_round_shift(a[23]); + + butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]); + out[25] = sub_round_shift(a[25]); + out[7] = sub_round_shift(a[7]); + + butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]); + out[5] = sub_round_shift(a[5]); + out[27] = sub_round_shift(a[27]); + + butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]); + out[21] = sub_round_shift(a[21]); + out[11] = sub_round_shift(a[11]); + + butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]); + out[13] = sub_round_shift(a[13]); + out[19] = sub_round_shift(a[19]); + + butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]); + out[29] = sub_round_shift(a[29]); + out[3] = sub_round_shift(a[3]); +} + +#define PASS_THROUGH(src, dst, element) \ + do { \ + dst##_lo[element] = src##_lo[element]; \ + dst##_hi[element] = src##_hi[element]; \ + } while (0) + +#define ADD_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define SUB_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \ + do { \ + c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \ + c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \ + } while (0) + +#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \ + do { \ + temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \ + temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \ + c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \ + c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \ + } while (0) + +#define ADD_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +#define SUB_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = 
vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +// Like butterfly_one_coeff, but don't narrow results. +static INLINE void butterfly_one_coeff_s16_s32( + const int16x8_t a, const int16x8_t b, const tran_high_t constant, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ + add_index, sub_index) \ + do { \ + butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \ + &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +// Like butterfly_one_coeff, but with s32. +static INLINE void butterfly_one_coeff_s32( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant); + const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant); + const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant); + const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant); + const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant); + const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ + sub_index) \ + do { \ + butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ + a##_lo[right_index], a##_hi[right_index], \ + constant, &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +// Like butterfly_two_coeff, but with s32. 
+static INLINE void butterfly_two_coeff_s32( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const int32_t constant0, const int32_t constant1, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + const int32x4_t a0 = vmulq_n_s32(a_lo, constant0); + const int32x4_t a1 = vmulq_n_s32(a_hi, constant0); + const int32x4_t a2 = vmulq_n_s32(a_lo, constant1); + const int32x4_t a3 = vmulq_n_s32(a_hi, constant1); + const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0); + const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0); + const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1); + const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ + right_constant, b, add_index, sub_index) \ + do { \ + butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ + a##_lo[right_index], a##_hi[right_index], \ + left_constant, right_constant, &b##_lo[add_index], \ + &b##_hi[add_index], &b##_lo[sub_index], \ + &b##_hi[sub_index]); \ + } while (0) + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. +static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo, + const int32x4_t a_hi) { + const int32x4_t one = vdupq_n_s32(1); + const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo); + const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31); + const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32); + const int16x4_t b_lo = + vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2); + const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi); + const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31); + const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32); + const int16x4_t b_hi = + vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2); + return vcombine_s16(b_lo, b_hi); +} + +static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + int32x4_t c_lo[32]; + int32x4_t c_hi[32]; + int32x4_t d_lo[32]; + int32x4_t d_hi[32]; + + // Stage 1. Done as part of the load for the first pass. 
+ a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); + + b[16] = a[16]; + b[17] = a[17]; + b[18] = a[18]; + b[19] = a[19]; + + butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); + + b[28] = a[28]; + b[29] = a[29]; + b[30] = a[30]; + b[31] = a[31]; + + // Stage 3. With extreme values for input this calculation rolls over int16_t. + // The sources for b[0] get added multiple times and, through testing, have + // been shown to overflow starting here. + ADD_S16_S32(b, 0, 7, c, 0); + ADD_S16_S32(b, 1, 6, c, 1); + ADD_S16_S32(b, 2, 5, c, 2); + ADD_S16_S32(b, 3, 4, c, 3); + SUB_S16_S32(b, 3, 4, c, 4); + SUB_S16_S32(b, 2, 5, c, 5); + SUB_S16_S32(b, 1, 6, c, 6); + SUB_S16_S32(b, 0, 7, c, 7); + + a[8] = b[8]; + a[9] = b[9]; + + BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10); + BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11); + + a[14] = b[14]; + a[15] = b[15]; + + ADD_S16_S32(b, 16, 23, c, 16); + ADD_S16_S32(b, 17, 22, c, 17); + ADD_S16_S32(b, 18, 21, c, 18); + ADD_S16_S32(b, 19, 20, c, 19); + SUB_S16_S32(b, 19, 20, c, 20); + SUB_S16_S32(b, 18, 21, c, 21); + SUB_S16_S32(b, 17, 22, c, 22); + SUB_S16_S32(b, 16, 23, c, 23); + SUB_S16_S32(b, 31, 24, c, 24); + SUB_S16_S32(b, 30, 25, c, 25); + SUB_S16_S32(b, 29, 26, c, 26); + SUB_S16_S32(b, 28, 27, c, 27); + ADD_S16_S32(b, 28, 27, c, 28); + ADD_S16_S32(b, 29, 26, c, 29); + ADD_S16_S32(b, 30, 25, c, 30); + ADD_S16_S32(b, 31, 24, c, 31); + + // Stage 4. 
+ ADD_S32(c, 0, 3, d, 0); + ADD_S32(c, 1, 2, d, 1); + SUB_S32(c, 1, 2, d, 2); + SUB_S32(c, 0, 3, d, 3); + + PASS_THROUGH(c, d, 4); + + BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5); + + PASS_THROUGH(c, d, 7); + + ADDW_S16_S32(c, 11, a, 8, d, 8); + ADDW_S16_S32(c, 10, a, 9, d, 9); + SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10); + SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11); + SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12); + SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13); + ADDW_S16_S32(c, 13, b, 14, d, 14); + ADDW_S16_S32(c, 12, b, 15, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 17); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19); + BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20); + BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21); + + PASS_THROUGH(c, d, 22); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 25); + + PASS_THROUGH(c, d, 30); + PASS_THROUGH(c, d, 31); + + // Stage 5. + BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1); + BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3); + + ADD_S32(d, 4, 5, c, 4); + SUB_S32(d, 4, 5, c, 5); + SUB_S32(d, 7, 6, c, 6); + ADD_S32(d, 7, 6, c, 7); + + PASS_THROUGH(d, c, 8); + + BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9); + BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10); + + PASS_THROUGH(d, c, 11); + PASS_THROUGH(d, c, 12); + PASS_THROUGH(d, c, 15); + + ADD_S32(d, 16, 19, c, 16); + ADD_S32(d, 17, 18, c, 17); + SUB_S32(d, 17, 18, c, 18); + SUB_S32(d, 16, 19, c, 19); + SUB_S32(d, 23, 20, c, 20); + SUB_S32(d, 22, 21, c, 21); + ADD_S32(d, 22, 21, c, 22); + ADD_S32(d, 23, 20, c, 23); + ADD_S32(d, 24, 27, c, 24); + ADD_S32(d, 25, 26, c, 25); + SUB_S32(d, 25, 26, c, 26); + SUB_S32(d, 24, 27, c, 27); + SUB_S32(d, 31, 28, c, 28); + SUB_S32(d, 30, 29, c, 29); + ADD_S32(d, 30, 29, c, 30); + ADD_S32(d, 31, 28, c, 31); + + // Stage 6. + PASS_THROUGH(c, d, 0); + PASS_THROUGH(c, d, 1); + PASS_THROUGH(c, d, 2); + PASS_THROUGH(c, d, 3); + + BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7); + BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6); + + ADD_S32(c, 8, 9, d, 8); + SUB_S32(c, 8, 9, d, 9); + SUB_S32(c, 11, 10, d, 10); + ADD_S32(c, 11, 10, d, 11); + ADD_S32(c, 12, 13, d, 12); + SUB_S32(c, 12, 13, d, 13); + SUB_S32(c, 15, 14, d, 14); + ADD_S32(c, 15, 14, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 19); + PASS_THROUGH(c, d, 20); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 27); + PASS_THROUGH(c, d, 28); + PASS_THROUGH(c, d, 31); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17); + BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21); + BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22); + + // Stage 7. 
+ PASS_THROUGH(d, c, 0); + PASS_THROUGH(d, c, 1); + PASS_THROUGH(d, c, 2); + PASS_THROUGH(d, c, 3); + PASS_THROUGH(d, c, 4); + PASS_THROUGH(d, c, 5); + PASS_THROUGH(d, c, 6); + PASS_THROUGH(d, c, 7); + + BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15); + BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14); + BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13); + BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12); + + ADD_S32(d, 16, 17, c, 16); + SUB_S32(d, 16, 17, c, 17); + SUB_S32(d, 19, 18, c, 18); + ADD_S32(d, 19, 18, c, 19); + ADD_S32(d, 20, 21, c, 20); + SUB_S32(d, 20, 21, c, 21); + SUB_S32(d, 23, 22, c, 22); + ADD_S32(d, 23, 22, c, 23); + ADD_S32(d, 24, 25, c, 24); + SUB_S32(d, 24, 25, c, 25); + SUB_S32(d, 27, 26, c, 26); + ADD_S32(d, 27, 26, c, 27); + ADD_S32(d, 28, 29, c, 28); + SUB_S32(d, 28, 29, c, 29); + SUB_S32(d, 31, 30, c, 30); + ADD_S32(d, 31, 30, c, 31); + + // Final stage. + // Roll rounding into this function so we can pass back int16x8. + + out[0] = add_round_shift_s32(c_lo[0], c_hi[0]); + out[16] = add_round_shift_s32(c_lo[1], c_hi[1]); + + out[8] = add_round_shift_s32(c_lo[2], c_hi[2]); + out[24] = add_round_shift_s32(c_lo[3], c_hi[3]); + out[4] = add_round_shift_s32(c_lo[4], c_hi[4]); + out[20] = add_round_shift_s32(c_lo[5], c_hi[5]); + out[12] = add_round_shift_s32(c_lo[6], c_hi[6]); + + out[28] = add_round_shift_s32(c_lo[7], c_hi[7]); + out[2] = add_round_shift_s32(c_lo[8], c_hi[8]); + out[18] = add_round_shift_s32(c_lo[9], c_hi[9]); + out[10] = add_round_shift_s32(c_lo[10], c_hi[10]); + + out[26] = add_round_shift_s32(c_lo[11], c_hi[11]); + out[6] = add_round_shift_s32(c_lo[12], c_hi[12]); + out[22] = add_round_shift_s32(c_lo[13], c_hi[13]); + out[14] = add_round_shift_s32(c_lo[14], c_hi[14]); + out[30] = add_round_shift_s32(c_lo[15], c_hi[15]); + + BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31); + out[1] = add_round_shift_s32(d_lo[1], d_hi[1]); + out[31] = add_round_shift_s32(d_lo[31], d_hi[31]); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15); + out[17] = add_round_shift_s32(d_lo[17], d_hi[17]); + out[15] = add_round_shift_s32(d_lo[15], d_hi[15]); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23); + out[9] = add_round_shift_s32(d_lo[9], d_hi[9]); + out[23] = add_round_shift_s32(d_lo[23], d_hi[23]); + + BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7); + out[25] = add_round_shift_s32(d_lo[25], d_hi[25]); + out[7] = add_round_shift_s32(d_lo[7], d_hi[7]); + + BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27); + out[5] = add_round_shift_s32(d_lo[5], d_hi[5]); + out[27] = add_round_shift_s32(d_lo[27], d_hi[27]); + + BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11); + out[21] = add_round_shift_s32(d_lo[21], d_hi[21]); + out[11] = add_round_shift_s32(d_lo[11], d_hi[11]); + + BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19); + out[13] = add_round_shift_s32(d_lo[13], d_hi[13]); + out[19] = add_round_shift_s32(d_lo[19], d_hi[19]); + + BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3); + out[29] = add_round_shift_s32(d_lo[29], d_hi[29]); + out[3] = add_round_shift_s32(d_lo[3], d_hi[3]); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. 
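[Editor's note: a minimal scalar sketch of the "add 1 if positive, 2 if negative, then shift by 2" rounding described in the comment above, assuming two's-complement arithmetic right shifts; the function name is illustrative and not part of the patch:

#include <stdint.h>

static int16_t add_round_shift_scalar(int16_t a) {
  const int16_t sign = (a < 0) ? 1 : 0; /* the sign bit, extracted with vshrq_n_u16(a, 15) in the vector code */
  /* Add 1, then add the sign bit (so negative values get 2 in total), then
   * shift by 2 without further rounding. */
  return (int16_t)((a + 1 + sign) >> 2);
}
]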
+static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) { + const int16x8_t one = vdupq_n_s16(1); + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2); +} + +static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1. Done as part of the load for the first pass. + a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. + // For the "rd" version, all the values are rounded down after stage 2 to keep + // the values in 16 bits. 
+ b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15])); + b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14])); + b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13])); + b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12])); + b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11])); + b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10])); + b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9])); + b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8])); + + b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8])); + b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9])); + b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10])); + b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11])); + b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12])); + b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13])); + b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14])); + b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15])); + + b[16] = add_round_shift_s16(a[16]); + b[17] = add_round_shift_s16(a[17]); + b[18] = add_round_shift_s16(a[18]); + b[19] = add_round_shift_s16(a[19]); + + butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); + b[20] = add_round_shift_s16(b[20]); + b[21] = add_round_shift_s16(b[21]); + b[22] = add_round_shift_s16(b[22]); + b[23] = add_round_shift_s16(b[23]); + b[24] = add_round_shift_s16(b[24]); + b[25] = add_round_shift_s16(b[25]); + b[26] = add_round_shift_s16(b[26]); + b[27] = add_round_shift_s16(b[27]); + + b[28] = add_round_shift_s16(a[28]); + b[29] = add_round_shift_s16(a[29]); + b[30] = add_round_shift_s16(a[30]); + b[31] = add_round_shift_s16(a[31]); + + // Stage 3. + a[0] = vaddq_s16(b[0], b[7]); + a[1] = vaddq_s16(b[1], b[6]); + a[2] = vaddq_s16(b[2], b[5]); + a[3] = vaddq_s16(b[3], b[4]); + + a[4] = vsubq_s16(b[3], b[4]); + a[5] = vsubq_s16(b[2], b[5]); + a[6] = vsubq_s16(b[1], b[6]); + a[7] = vsubq_s16(b[0], b[7]); + + a[8] = b[8]; + a[9] = b[9]; + + butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]); + butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]); + + a[14] = b[14]; + a[15] = b[15]; + + a[16] = vaddq_s16(b[16], b[23]); + a[17] = vaddq_s16(b[17], b[22]); + a[18] = vaddq_s16(b[18], b[21]); + a[19] = vaddq_s16(b[19], b[20]); + + a[20] = vsubq_s16(b[19], b[20]); + a[21] = vsubq_s16(b[18], b[21]); + a[22] = vsubq_s16(b[17], b[22]); + a[23] = vsubq_s16(b[16], b[23]); + + a[24] = vsubq_s16(b[31], b[24]); + a[25] = vsubq_s16(b[30], b[25]); + a[26] = vsubq_s16(b[29], b[26]); + a[27] = vsubq_s16(b[28], b[27]); + + a[28] = vaddq_s16(b[28], b[27]); + a[29] = vaddq_s16(b[29], b[26]); + a[30] = vaddq_s16(b[30], b[25]); + a[31] = vaddq_s16(b[31], b[24]); + + // Stage 4. 
+ b[0] = vaddq_s16(a[0], a[3]); + b[1] = vaddq_s16(a[1], a[2]); + b[2] = vsubq_s16(a[1], a[2]); + b[3] = vsubq_s16(a[0], a[3]); + + b[4] = a[4]; + + butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]); + + b[7] = a[7]; + + b[8] = vaddq_s16(a[8], a[11]); + b[9] = vaddq_s16(a[9], a[10]); + b[10] = vsubq_s16(a[9], a[10]); + b[11] = vsubq_s16(a[8], a[11]); + b[12] = vsubq_s16(a[15], a[12]); + b[13] = vsubq_s16(a[14], a[13]); + b[14] = vaddq_s16(a[14], a[13]); + b[15] = vaddq_s16(a[15], a[12]); + + b[16] = a[16]; + b[17] = a[17]; + + butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]); + butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]); + butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]); + butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]); + + b[22] = a[22]; + b[23] = a[23]; + b[24] = a[24]; + b[25] = a[25]; + + b[30] = a[30]; + b[31] = a[31]; + + // Stage 5. + butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]); + butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]); + + a[4] = vaddq_s16(b[4], b[5]); + a[5] = vsubq_s16(b[4], b[5]); + a[6] = vsubq_s16(b[7], b[6]); + a[7] = vaddq_s16(b[7], b[6]); + + a[8] = b[8]; + + butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]); + butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]); + + a[11] = b[11]; + a[12] = b[12]; + + a[15] = b[15]; + + a[16] = vaddq_s16(b[19], b[16]); + a[17] = vaddq_s16(b[18], b[17]); + a[18] = vsubq_s16(b[17], b[18]); + a[19] = vsubq_s16(b[16], b[19]); + a[20] = vsubq_s16(b[23], b[20]); + a[21] = vsubq_s16(b[22], b[21]); + a[22] = vaddq_s16(b[21], b[22]); + a[23] = vaddq_s16(b[20], b[23]); + a[24] = vaddq_s16(b[27], b[24]); + a[25] = vaddq_s16(b[26], b[25]); + a[26] = vsubq_s16(b[25], b[26]); + a[27] = vsubq_s16(b[24], b[27]); + a[28] = vsubq_s16(b[31], b[28]); + a[29] = vsubq_s16(b[30], b[29]); + a[30] = vaddq_s16(b[29], b[30]); + a[31] = vaddq_s16(b[28], b[31]); + + // Stage 6. + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + + butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]); + butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]); + + b[8] = vaddq_s16(a[8], a[9]); + b[9] = vsubq_s16(a[8], a[9]); + b[10] = vsubq_s16(a[11], a[10]); + b[11] = vaddq_s16(a[11], a[10]); + b[12] = vaddq_s16(a[12], a[13]); + b[13] = vsubq_s16(a[12], a[13]); + b[14] = vsubq_s16(a[15], a[14]); + b[15] = vaddq_s16(a[15], a[14]); + + b[16] = a[16]; + b[19] = a[19]; + b[20] = a[20]; + b[23] = a[23]; + b[24] = a[24]; + b[27] = a[27]; + b[28] = a[28]; + b[31] = a[31]; + + butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]); + butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]); + + butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]); + butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]); + + // Stage 7. 
+ a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + a[4] = b[4]; + a[5] = b[5]; + a[6] = b[6]; + a[7] = b[7]; + + butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]); + butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]); + butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]); + butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]); + + a[16] = vaddq_s16(b[16], b[17]); + a[17] = vsubq_s16(b[16], b[17]); + a[18] = vsubq_s16(b[19], b[18]); + a[19] = vaddq_s16(b[19], b[18]); + a[20] = vaddq_s16(b[20], b[21]); + a[21] = vsubq_s16(b[20], b[21]); + a[22] = vsubq_s16(b[23], b[22]); + a[23] = vaddq_s16(b[23], b[22]); + a[24] = vaddq_s16(b[24], b[25]); + a[25] = vsubq_s16(b[24], b[25]); + a[26] = vsubq_s16(b[27], b[26]); + a[27] = vaddq_s16(b[27], b[26]); + a[28] = vaddq_s16(b[28], b[29]); + a[29] = vsubq_s16(b[28], b[29]); + a[30] = vsubq_s16(b[31], b[30]); + a[31] = vaddq_s16(b[31], b[30]); + + // Final stage. + out[0] = a[0]; + out[16] = a[1]; + out[8] = a[2]; + out[24] = a[3]; + out[4] = a[4]; + out[20] = a[5]; + out[12] = a[6]; + out[28] = a[7]; + out[2] = a[8]; + out[18] = a[9]; + out[10] = a[10]; + out[26] = a[11]; + out[6] = a[12]; + out[22] = a[13]; + out[14] = a[14]; + out[30] = a[15]; + + butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]); + butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17], + &out[15]); + butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]); + butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]); + butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]); + butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21], + &out[11]); + butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13], + &out[19]); + butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]); +} + +#undef PASS_THROUGH +#undef ADD_S16_S32 +#undef SUB_S16_S32 +#undef ADDW_S16_S32 +#undef SUBW_S16_S32 +#undef ADD_S32 +#undef SUB_S32 +#undef BUTTERFLY_ONE_S16_S32 +#undef BUTTERFLY_ONE_S32 +#undef BUTTERFLY_TWO_S32 + +// Transpose 8x8 to a new location. Don't use transpose_neon.h because those +// are all in-place. +// TODO(johannkoenig): share with other fdcts. +static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) { + // Swap 16 bit elements. + const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements. 
+ const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); + const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), + vreinterpretq_s32_s16(c3.val[0])); + const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), + vreinterpretq_s32_s16(c3.val[1])); + + // Swap 64 bit elements + const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); + const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); + const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); + const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); + + b[0] = e0.val[0]; + b[1] = e1.val[0]; + b[2] = e2.val[0]; + b[3] = e3.val[0]; + b[4] = e0.val[1]; + b[5] = e1.val[1]; + b[6] = e2.val[1]; + b[7] = e3.val[1]; +} + +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { + int16x8_t temp0[32]; + int16x8_t temp1[32]; + int16x8_t temp2[32]; + int16x8_t temp3[32]; + int16x8_t temp4[32]; + int16x8_t temp5[32]; + + // Process in 8x32 columns. + load(input, stride, temp0); + dct_body_first_pass(temp0, temp1); + + load(input + 8, stride, temp0); + dct_body_first_pass(temp0, temp2); + + load(input + 16, stride, temp0); + dct_body_first_pass(temp0, temp3); + + load(input + 24, stride, temp0); + dct_body_first_pass(temp0, temp4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_8x8(&temp1[0], &temp0[0]); + transpose_8x8(&temp2[0], &temp0[8]); + transpose_8x8(&temp3[0], &temp0[16]); + transpose_8x8(&temp4[0], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output, temp5); + + // Second row of 8x32. 
+ transpose_8x8(&temp1[8], &temp0[0]); + transpose_8x8(&temp2[8], &temp0[8]); + transpose_8x8(&temp3[8], &temp0[16]); + transpose_8x8(&temp4[8], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 8 * 32, temp5); + + // Third row of 8x32 + transpose_8x8(&temp1[16], &temp0[0]); + transpose_8x8(&temp2[16], &temp0[8]); + transpose_8x8(&temp3[16], &temp0[16]); + transpose_8x8(&temp4[16], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 16 * 32, temp5); + + // Final row of 8x32. + transpose_8x8(&temp1[24], &temp0[0]); + transpose_8x8(&temp2[24], &temp0[8]); + transpose_8x8(&temp3[24], &temp0[16]); + transpose_8x8(&temp4[24], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 24 * 32, temp5); +} + +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[32]; + int16x8_t temp1[32]; + int16x8_t temp2[32]; + int16x8_t temp3[32]; + int16x8_t temp4[32]; + int16x8_t temp5[32]; + + // Process in 8x32 columns. + load(input, stride, temp0); + dct_body_first_pass(temp0, temp1); + + load(input + 8, stride, temp0); + dct_body_first_pass(temp0, temp2); + + load(input + 16, stride, temp0); + dct_body_first_pass(temp0, temp3); + + load(input + 24, stride, temp0); + dct_body_first_pass(temp0, temp4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_8x8(&temp1[0], &temp0[0]); + transpose_8x8(&temp2[0], &temp0[8]); + transpose_8x8(&temp3[0], &temp0[16]); + transpose_8x8(&temp4[0], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output, temp5); + + // Second row of 8x32. 
+ transpose_8x8(&temp1[8], &temp0[0]); + transpose_8x8(&temp2[8], &temp0[8]); + transpose_8x8(&temp3[8], &temp0[16]); + transpose_8x8(&temp4[8], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 8 * 32, temp5); + + // Third row of 8x32 + transpose_8x8(&temp1[16], &temp0[0]); + transpose_8x8(&temp2[16], &temp0[8]); + transpose_8x8(&temp3[16], &temp0[16]); + transpose_8x8(&temp4[16], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 16 * 32, temp5); + + // Final row of 8x32. + transpose_8x8(&temp1[24], &temp0[0]); + transpose_8x8(&temp2[24], &temp0[8]); + transpose_8x8(&temp3[24], &temp0[16]); + transpose_8x8(&temp4[24], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 24 * 32, temp5); +} +#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && + // __GNUC__ == 4 && __GNUC_MINOR__ <= 9 diff --git a/libs/libvpx/vpx_dsp/arm/fdct_neon.c b/libs/libvpx/vpx_dsp/arm/fdct_neon.c new file mode 100644 index 0000000000..04646ed2e0 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/fdct_neon.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ int i;
+ // input[M * stride] * 16
+ int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+ int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+ int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+ int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+
+ // If the very first value != 0, then add 1.
+ if (input[0] != 0) {
+ const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
+ input_0 = vadd_s16(input_0, one);
+ }
+
+ for (i = 0; i < 2; ++i) {
+ const int16x8_t input_01 = vcombine_s16(input_0, input_1);
+ const int16x8_t input_32 = vcombine_s16(input_3, input_2);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+ // (s_0 +/- s_1) * cospi_16_64
+ // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
+ const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
+ const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
+ const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
+ const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
+
+ // fdct_round_shift
+ int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
+ int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
+ const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
+
+ const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
+ const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
+
+ // fdct_round_shift
+ int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
+ int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
+
+ transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+
+ input_0 = out_0;
+ input_1 = out_1;
+ input_2 = out_2;
+ input_3 = out_3;
+ }
+
+ {
+ // Not quite a rounding shift. Only add 1 despite shifting by 2.
+ const int16x8_t one = vdupq_n_s16(1);
+ int16x8_t out_01 = vcombine_s16(input_0, input_1);
+ int16x8_t out_23 = vcombine_s16(input_2, input_3);
+ out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
+ out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
+ store_s16q_to_tran_low(final_output + 0 * 8, out_01);
+ store_s16q_to_tran_low(final_output + 1 * 8, out_23);
+ }
+}
diff --git a/libs/libvpx/vpx_dsp/arm/fdct_partial_neon.c b/libs/libvpx/vpx_dsp/arm/fdct_partial_neon.c
new file mode 100644
index 0000000000..e73de41d77
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/arm/fdct_partial_neon.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE tran_low_t get_lane(const int32x2_t a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ return vget_lane_s32(a, 0);
+#else
+ return vget_lane_s16(vreinterpret_s16_s32(a), 0);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
+void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) {
+ int16x4_t a0, a1, a2, a3;
+ int16x8_t b0, b1;
+ int16x8_t c;
+ int32x2_t d;
+
+ a0 = vld1_s16(input);
+ input += stride;
+ a1 = vld1_s16(input);
+ input += stride;
+ a2 = vld1_s16(input);
+ input += stride;
+ a3 = vld1_s16(input);
+
+ b0 = vcombine_s16(a0, a1);
+ b1 = vcombine_s16(a2, a3);
+
+ c = vaddq_s16(b0, b1);
+
+ d = horizontal_add_int16x8(c);
+
+ output[0] = get_lane(vshl_n_s32(d, 1));
+ output[1] = 0;
+}
+
+void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
+ int r;
+ int16x8_t sum = vld1q_s16(&input[0]);
+
+ for (r = 1; r < 8; ++r) {
+ const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
+ sum = vaddq_s16(sum, input_00);
+ }
+
+ output[0] = get_lane(horizontal_add_int16x8(sum));
+ output[1] = 0;
+}
+
+void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int r;
+ int16x8_t left = vld1q_s16(input);
+ int16x8_t right = vld1q_s16(input + 8);
+ int32x2_t sum;
+ input += stride;
+
+ for (r = 1; r < 16; ++r) {
+ const int16x8_t a = vld1q_s16(input);
+ const int16x8_t b = vld1q_s16(input + 8);
+ input += stride;
+ left = vaddq_s16(left, a);
+ right = vaddq_s16(right, b);
+ }
+
+ sum = vadd_s32(horizontal_add_int16x8(left), horizontal_add_int16x8(right));
+
+ output[0] = get_lane(vshr_n_s32(sum, 1));
+ output[1] = 0;
+}
+
+void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int r;
+ int16x8_t a0 = vld1q_s16(input);
+ int16x8_t a1 = vld1q_s16(input + 8);
+ int16x8_t a2 = vld1q_s16(input + 16);
+ int16x8_t a3 = vld1q_s16(input + 24);
+ int32x2_t sum;
+ input += stride;
+
+ for (r = 1; r < 32; ++r) {
+ const int16x8_t b0 = vld1q_s16(input);
+ const int16x8_t b1 = vld1q_s16(input + 8);
+ const int16x8_t b2 = vld1q_s16(input + 16);
+ const int16x8_t b3 = vld1q_s16(input + 24);
+ input += stride;
+ a0 = vaddq_s16(a0, b0);
+ a1 = vaddq_s16(a1, b1);
+ a2 = vaddq_s16(a2, b2);
+ a3 = vaddq_s16(a3, b3);
+ }
+
+ sum = vadd_s32(horizontal_add_int16x8(a0), horizontal_add_int16x8(a1));
+ sum = vadd_s32(sum, horizontal_add_int16x8(a2));
+ sum = vadd_s32(sum, horizontal_add_int16x8(a3));
+ output[0] = get_lane(vshr_n_s32(sum, 3));
+ output[1] = 0;
+}
diff --git a/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
index 7cb2ba90d2..8049277b13 100644
--- a/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
+++ b/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
@@ -12,8 +12,12 @@
#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
-void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
+void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
int i;
// stage 1
int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
@@ -44,14 +48,14 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
int32x4_t v_t1_hi =
vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64); - int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64); - int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64); - int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64); - v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64); - v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); + int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64); + int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64); + int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64); + int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64); + v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64); + v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64); v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); @@ -73,10 +77,10 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { // Stage 2 v_x0 = vsubq_s16(v_s6, v_s5); v_x1 = vaddq_s16(v_s6, v_s5); - v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64); { const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); @@ -91,22 +95,22 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { v_x3 = vaddq_s16(v_s7, cd); } // Stage 4 - v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64); - v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64); - v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64); - v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64); - v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64); - v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64); - v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64); - v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64); - v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64); - v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64); - v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64); + v_t0_lo = 
vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
+ v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
+ v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
+ v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
+ v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
+ v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
+ v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
+ v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
+ v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
+ v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
{
const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
@@ -122,6 +126,8 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
}
// transpose 8x8
+ // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
+ // columns.
{
// 00 01 02 03 40 41 42 43
// 10 11 12 13 50 51 52 53
@@ -191,30 +197,13 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
input_6 = vhsubq_s16(input_6, sign_in6);
input_7 = vhsubq_s16(input_7, sign_in7);
// store results
- vst1q_s16(&final_output[0 * 8], input_0);
- vst1q_s16(&final_output[1 * 8], input_1);
- vst1q_s16(&final_output[2 * 8], input_2);
- vst1q_s16(&final_output[3 * 8], input_3);
- vst1q_s16(&final_output[4 * 8], input_4);
- vst1q_s16(&final_output[5 * 8], input_5);
- vst1q_s16(&final_output[6 * 8], input_6);
- vst1q_s16(&final_output[7 * 8], input_7);
- }
-}
-
-void vpx_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
- int r;
- int16x8_t sum = vld1q_s16(&input[0]);
- for (r = 1; r < 8; ++r) {
- const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
- sum = vaddq_s16(sum, input_00);
- }
- {
- const int32x4_t a = vpaddlq_s16(sum);
- const int64x2_t b = vpaddlq_s32(a);
- const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
- vreinterpret_s32_s64(vget_high_s64(b)));
- output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
- output[1] = 0;
+ store_s16q_to_tran_low(final_output + 0 * 8, input_0);
+ store_s16q_to_tran_low(final_output + 1 * 8, input_1);
+ store_s16q_to_tran_low(final_output + 2 * 8, input_2);
+ store_s16q_to_tran_low(final_output + 3 * 8, input_3);
+ store_s16q_to_tran_low(final_output + 4 * 8, input_4);
+ store_s16q_to_tran_low(final_output + 5 * 8, input_5);
+ store_s16q_to_tran_low(final_output + 6 * 8, input_6);
+ store_s16q_to_tran_low(final_output + 7 * 8, input_7);
}
}
diff --git a/libs/libvpx/vpx_dsp/arm/hadamard_neon.c b/libs/libvpx/vpx_dsp/arm/hadamard_neon.c
index 977323497a..523a63c6f7 100644
--- a/libs/libvpx/vpx_dsp/arm/hadamard_neon.c
+++ b/libs/libvpx/vpx_dsp/arm/hadamard_neon.c
@@ -11,6 +11,9 @@
#include <arm_neon.h>
#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
@@ -44,8 +47,8 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
*a7 = vaddq_s16(c1, c5);
}
-void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
- int16_t *coeff) { +void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { int16x8_t a0 = vld1q_s16(src_diff); int16x8_t a1 = vld1q_s16(src_diff + src_stride); int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride); @@ -63,18 +66,18 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, // Skip the second transpose because it is not required. - vst1q_s16(coeff + 0, a0); - vst1q_s16(coeff + 8, a1); - vst1q_s16(coeff + 16, a2); - vst1q_s16(coeff + 24, a3); - vst1q_s16(coeff + 32, a4); - vst1q_s16(coeff + 40, a5); - vst1q_s16(coeff + 48, a6); - vst1q_s16(coeff + 56, a7); + store_s16q_to_tran_low(coeff + 0, a0); + store_s16q_to_tran_low(coeff + 8, a1); + store_s16q_to_tran_low(coeff + 16, a2); + store_s16q_to_tran_low(coeff + 24, a3); + store_s16q_to_tran_low(coeff + 32, a4); + store_s16q_to_tran_low(coeff + 40, a5); + store_s16q_to_tran_low(coeff + 48, a6); + store_s16q_to_tran_low(coeff + 56, a7); } -void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, - int16_t *coeff) { +void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { int i; /* Rearrange 16x16 to 8x32 and remove stride. @@ -88,10 +91,10 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, vpx_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); for (i = 0; i < 64; i += 8) { - const int16x8_t a0 = vld1q_s16(coeff + 0); - const int16x8_t a1 = vld1q_s16(coeff + 64); - const int16x8_t a2 = vld1q_s16(coeff + 128); - const int16x8_t a3 = vld1q_s16(coeff + 192); + const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0); + const int16x8_t a1 = load_tran_low_to_s16q(coeff + 64); + const int16x8_t a2 = load_tran_low_to_s16q(coeff + 128); + const int16x8_t a3 = load_tran_low_to_s16q(coeff + 192); const int16x8_t b0 = vhaddq_s16(a0, a1); const int16x8_t b1 = vhsubq_s16(a0, a1); @@ -103,10 +106,10 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, const int16x8_t c2 = vsubq_s16(b0, b2); const int16x8_t c3 = vsubq_s16(b1, b3); - vst1q_s16(coeff + 0, c0); - vst1q_s16(coeff + 64, c1); - vst1q_s16(coeff + 128, c2); - vst1q_s16(coeff + 192, c3); + store_s16q_to_tran_low(coeff + 0, c0); + store_s16q_to_tran_low(coeff + 64, c1); + store_s16q_to_tran_low(coeff + 128, c2); + store_s16q_to_tran_low(coeff + 192, c3); coeff += 8; } diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c new file mode 100644 index 0000000000..5358839b53 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c @@ -0,0 +1,1437 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void highbd_idct16x16_add_wrap_low_8x2(const int64x2x2_t *const t,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int32x2x2_t t32[4];
+
+ t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS);
+ t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS);
+ t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS);
+ t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS);
+ t32[2].val[0] = vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS);
+ t32[2].val[1] = vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS);
+ t32[3].val[0] = vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS);
+ t32[3].val[1] = vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS);
+ d0->val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]);
+ d0->val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]);
+ d1->val[0] = vcombine_s32(t32[2].val[0], t32[2].val[1]);
+ d1->val[1] = vcombine_s32(t32[3].val[0], t32[3].val[1]);
+}
+
+static INLINE void highbd_idct16x16_add_wrap_low_4x2(const int64x2x2_t *const t,
+ int32x4_t *const d0,
+ int32x4_t *const d1) {
+ int32x2x2_t t32[2];
+
+ t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS);
+ t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS);
+ t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS);
+ t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS);
+ *d0 = vcombine_s32(t32[0].val[0], t32[0].val[1]);
+ *d1 = vcombine_s32(t32[1].val[0], t32[1].val[1]);
+}
+
+static INLINE int32x4x2_t
+highbd_idct16x16_add_wrap_low_8x1(const int64x2x2_t *const t) {
+ int32x2x2_t t32[2];
+ int32x4x2_t d;
+
+ t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS);
+ t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS);
+ t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS);
+ t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS);
+ d.val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]);
+ d.val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]);
+ return d;
+}
+
+static INLINE int32x4_t highbd_idct16x16_add_wrap_low_4x1(const int64x2x2_t t) {
+ int32x2x2_t t32;
+
+ t32.val[0] = vrshrn_n_s64(t.val[0], DCT_CONST_BITS);
+ t32.val[1] = vrshrn_n_s64(t.val[1], DCT_CONST_BITS);
+ return vcombine_s32(t32.val[0], t32.val[1]);
+}
+
+static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_2_30_10_22,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[4];
+
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+
vget_low_s32(cospi_2_30_10_22), 0); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_2_30_10_22), 0); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_2_30_10_22), 0); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_2_30_10_22), 0); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_2_30_10_22), 0); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_2_30_10_22), 0); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_4_12_20N_28, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_6_26N_14_18N, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[0].val[0] = vmlal_lane_s32(t[0].val[0], 
vget_low_s32(s1.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 1); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_10_22(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_2_30_10_22, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_4_12_20N_28, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + 
t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_6_26N_14_18N, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 1); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_q_kernel( + const int32x4x2_t s0, const int32x4x2_t s1, const int32x4_t cospi_0_8_16_24, + int64x2x2_t *const t) { + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + 
vget_high_s32(cospi_0_8_16_24), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); +} + +static INLINE void highbd_idct_cospi_8_24_d_kernel( + const int32x4_t s0, const int32x4_t s1, const int32x4_t cospi_0_8_16_24, + int64x2x2_t *const t) { + t[0].val[0] = + vmull_lane_s32(vget_low_s32(s0), vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[1] = + vmull_lane_s32(vget_high_s32(s0), vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[0] = + vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[1] = + vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1), + vget_low_s32(cospi_0_8_16_24), 1); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s0), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s0), + vget_low_s32(cospi_0_8_16_24), 1); +} + +static INLINE void highbd_idct_cospi_8_24_q(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_d(const int32x4_t s0, + const int32x4_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4_t *const d0, + int32x4_t *const d1) { + int64x2x2_t t[2]; + + highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t); + highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t); + t[2].val[0] = vsubq_s64(vdupq_n_s64(0), t[2].val[0]); + t[2].val[1] = vsubq_s64(vdupq_n_s64(0), t[2].val[1]); + t[3].val[0] = vsubq_s64(vdupq_n_s64(0), t[3].val[0]); + t[3].val[1] = vsubq_s64(vdupq_n_s64(0), t[3].val[1]); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_neg_d(const int32x4_t s0, + const int32x4_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4_t *const d0, + int32x4_t *const d1) { + int64x2x2_t t[2]; + + 
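+  // Reuse the plain 8/24 butterfly kernel, then negate its second product pair below so the rounded outputs match the butterfly that needs the negated constants.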
highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t); + t[1].val[0] = vsubq_s64(vdupq_n_s64(0), t[1].val[0]); + t[1].val[1] = vsubq_s64(vdupq_n_s64(0), t[1].val[1]); + highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[6]; + + t[4].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[4].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[5].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[5].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[0] = vmlsl_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[1] = vmlsl_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[0] = vmlsl_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[1] = vmlsl_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[2].val[0] = vmlal_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[2].val[1] = vmlal_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[3].val[0] = vmlal_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[3].val[1] = vmlal_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0, + const int32x4_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4_t *const d0, + int32x4_t *const d1) { + int64x2x2_t t[3]; + + t[2].val[0] = + vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 0); + t[2].val[1] = + vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); +} + +static INLINE void highbd_idct16x16_add_stage7_dual( + const int32x4x2_t *const step2, int32x4x2_t *const out) { + out[0].val[0] = vaddq_s32(step2[0].val[0], step2[15].val[0]); + out[0].val[1] = vaddq_s32(step2[0].val[1], step2[15].val[1]); + out[1].val[0] = vaddq_s32(step2[1].val[0], step2[14].val[0]); + out[1].val[1] = vaddq_s32(step2[1].val[1], step2[14].val[1]); + out[2].val[0] = vaddq_s32(step2[2].val[0], step2[13].val[0]); + out[2].val[1] = vaddq_s32(step2[2].val[1], step2[13].val[1]); + out[3].val[0] = vaddq_s32(step2[3].val[0], step2[12].val[0]); + out[3].val[1] = vaddq_s32(step2[3].val[1], step2[12].val[1]); + out[4].val[0] = vaddq_s32(step2[4].val[0], step2[11].val[0]); + out[4].val[1] = vaddq_s32(step2[4].val[1], step2[11].val[1]); + out[5].val[0] = vaddq_s32(step2[5].val[0], step2[10].val[0]); + out[5].val[1] = vaddq_s32(step2[5].val[1], step2[10].val[1]); + out[6].val[0] = vaddq_s32(step2[6].val[0], step2[9].val[0]); + out[6].val[1] = vaddq_s32(step2[6].val[1], step2[9].val[1]); + 
out[7].val[0] = vaddq_s32(step2[7].val[0], step2[8].val[0]); + out[7].val[1] = vaddq_s32(step2[7].val[1], step2[8].val[1]); + out[8].val[0] = vsubq_s32(step2[7].val[0], step2[8].val[0]); + out[8].val[1] = vsubq_s32(step2[7].val[1], step2[8].val[1]); + out[9].val[0] = vsubq_s32(step2[6].val[0], step2[9].val[0]); + out[9].val[1] = vsubq_s32(step2[6].val[1], step2[9].val[1]); + out[10].val[0] = vsubq_s32(step2[5].val[0], step2[10].val[0]); + out[10].val[1] = vsubq_s32(step2[5].val[1], step2[10].val[1]); + out[11].val[0] = vsubq_s32(step2[4].val[0], step2[11].val[0]); + out[11].val[1] = vsubq_s32(step2[4].val[1], step2[11].val[1]); + out[12].val[0] = vsubq_s32(step2[3].val[0], step2[12].val[0]); + out[12].val[1] = vsubq_s32(step2[3].val[1], step2[12].val[1]); + out[13].val[0] = vsubq_s32(step2[2].val[0], step2[13].val[0]); + out[13].val[1] = vsubq_s32(step2[2].val[1], step2[13].val[1]); + out[14].val[0] = vsubq_s32(step2[1].val[0], step2[14].val[0]); + out[14].val[1] = vsubq_s32(step2[1].val[1], step2[14].val[1]); + out[15].val[0] = vsubq_s32(step2[0].val[0], step2[15].val[0]); + out[15].val[1] = vsubq_s32(step2[0].val[1], step2[15].val[1]); +} + +static INLINE void highbd_idct16x16_add_stage7(const int32x4_t *const step2, + int32x4_t *const out) { + out[0] = vaddq_s32(step2[0], step2[15]); + out[1] = vaddq_s32(step2[1], step2[14]); + out[2] = vaddq_s32(step2[2], step2[13]); + out[3] = vaddq_s32(step2[3], step2[12]); + out[4] = vaddq_s32(step2[4], step2[11]); + out[5] = vaddq_s32(step2[5], step2[10]); + out[6] = vaddq_s32(step2[6], step2[9]); + out[7] = vaddq_s32(step2[7], step2[8]); + out[8] = vsubq_s32(step2[7], step2[8]); + out[9] = vsubq_s32(step2[6], step2[9]); + out[10] = vsubq_s32(step2[5], step2[10]); + out[11] = vsubq_s32(step2[4], step2[11]); + out[12] = vsubq_s32(step2[3], step2[12]); + out[13] = vsubq_s32(step2[2], step2[13]); + out[14] = vsubq_s32(step2[1], step2[14]); + out[15] = vsubq_s32(step2[0], step2[15]); +} + +static INLINE void highbd_idct16x16_store_pass1(const int32x4x2_t *const out, + int32_t *output) { + // Save the result into output + vst1q_s32(output + 0, out[0].val[0]); + vst1q_s32(output + 4, out[0].val[1]); + output += 16; + vst1q_s32(output + 0, out[1].val[0]); + vst1q_s32(output + 4, out[1].val[1]); + output += 16; + vst1q_s32(output + 0, out[2].val[0]); + vst1q_s32(output + 4, out[2].val[1]); + output += 16; + vst1q_s32(output + 0, out[3].val[0]); + vst1q_s32(output + 4, out[3].val[1]); + output += 16; + vst1q_s32(output + 0, out[4].val[0]); + vst1q_s32(output + 4, out[4].val[1]); + output += 16; + vst1q_s32(output + 0, out[5].val[0]); + vst1q_s32(output + 4, out[5].val[1]); + output += 16; + vst1q_s32(output + 0, out[6].val[0]); + vst1q_s32(output + 4, out[6].val[1]); + output += 16; + vst1q_s32(output + 0, out[7].val[0]); + vst1q_s32(output + 4, out[7].val[1]); + output += 16; + vst1q_s32(output + 0, out[8].val[0]); + vst1q_s32(output + 4, out[8].val[1]); + output += 16; + vst1q_s32(output + 0, out[9].val[0]); + vst1q_s32(output + 4, out[9].val[1]); + output += 16; + vst1q_s32(output + 0, out[10].val[0]); + vst1q_s32(output + 4, out[10].val[1]); + output += 16; + vst1q_s32(output + 0, out[11].val[0]); + vst1q_s32(output + 4, out[11].val[1]); + output += 16; + vst1q_s32(output + 0, out[12].val[0]); + vst1q_s32(output + 4, out[12].val[1]); + output += 16; + vst1q_s32(output + 0, out[13].val[0]); + vst1q_s32(output + 4, out[13].val[1]); + output += 16; + vst1q_s32(output + 0, out[14].val[0]); + vst1q_s32(output + 4, out[14].val[1]); + output += 16; + 
vst1q_s32(output + 0, out[15].val[0]); + vst1q_s32(output + 4, out[15].val[1]); +} + +static void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, + int32_t *output, uint16_t *dest, + const int stride, + const int bd) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12); + int32x4x2_t in[16], step1[16], step2[16], out[16]; + + // Load input (16x8) + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 8; + in[8].val[0] = vld1q_s32(input); + in[8].val[1] = vld1q_s32(input + 4); + input += 8; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 8; + in[9].val[0] = vld1q_s32(input); + in[9].val[1] = vld1q_s32(input + 4); + input += 8; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = vld1q_s32(input + 4); + input += 8; + in[10].val[0] = vld1q_s32(input); + in[10].val[1] = vld1q_s32(input + 4); + input += 8; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 8; + in[11].val[0] = vld1q_s32(input); + in[11].val[1] = vld1q_s32(input + 4); + input += 8; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 8; + in[12].val[0] = vld1q_s32(input); + in[12].val[1] = vld1q_s32(input + 4); + input += 8; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 8; + in[13].val[0] = vld1q_s32(input); + in[13].val[1] = vld1q_s32(input + 4); + input += 8; + in[6].val[0] = vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 8; + in[14].val[0] = vld1q_s32(input); + in[14].val[1] = vld1q_s32(input + 4); + input += 8; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + input += 8; + in[15].val[0] = vld1q_s32(input); + in[15].val[1] = vld1q_s32(input + 4); + + // Transpose + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s32_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[1] = in[16 / 2]; + step1[2] = in[8 / 2]; + step1[3] = in[24 / 2]; + step1[4] = in[4 / 2]; + step1[5] = in[20 / 2]; + step1[6] = in[12 / 2]; + step1[7] = in[28 / 2]; + step1[8] = in[2 / 2]; + step1[9] = in[18 / 2]; + step1[10] = in[10 / 2]; + step1[11] = in[26 / 2]; + step1[12] = in[6 / 2]; + step1[13] = in[22 / 2]; + step1[14] = in[14 / 2]; + step1[15] = in[30 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + highbd_idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], + &step2[15]); + highbd_idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9], + &step2[14]); + highbd_idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10], + &step2[13]); + highbd_idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11], + &step2[12]); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + highbd_idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], + &step1[7]); + highbd_idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], + &step1[6]); + step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[9].val[0]); + step1[8].val[1] = vaddq_s32(step2[8].val[1], 
step2[9].val[1]); + step1[9].val[0] = vsubq_s32(step2[8].val[0], step2[9].val[0]); + step1[9].val[1] = vsubq_s32(step2[8].val[1], step2[9].val[1]); + step1[10].val[0] = vsubq_s32(step2[11].val[0], step2[10].val[0]); + step1[10].val[1] = vsubq_s32(step2[11].val[1], step2[10].val[1]); + step1[11].val[0] = vaddq_s32(step2[11].val[0], step2[10].val[0]); + step1[11].val[1] = vaddq_s32(step2[11].val[1], step2[10].val[1]); + step1[12].val[0] = vaddq_s32(step2[12].val[0], step2[13].val[0]); + step1[12].val[1] = vaddq_s32(step2[12].val[1], step2[13].val[1]); + step1[13].val[0] = vsubq_s32(step2[12].val[0], step2[13].val[0]); + step1[13].val[1] = vsubq_s32(step2[12].val[1], step2[13].val[1]); + step1[14].val[0] = vsubq_s32(step2[15].val[0], step2[14].val[0]); + step1[14].val[1] = vsubq_s32(step2[15].val[1], step2[14].val[1]); + step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[14].val[0]); + step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[14].val[1]); + + // stage 4 + highbd_idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], + &step2[0]); + highbd_idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], + &step2[3]); + step2[4].val[0] = vaddq_s32(step1[4].val[0], step1[5].val[0]); + step2[4].val[1] = vaddq_s32(step1[4].val[1], step1[5].val[1]); + step2[5].val[0] = vsubq_s32(step1[4].val[0], step1[5].val[0]); + step2[5].val[1] = vsubq_s32(step1[4].val[1], step1[5].val[1]); + step2[6].val[0] = vsubq_s32(step1[7].val[0], step1[6].val[0]); + step2[6].val[1] = vsubq_s32(step1[7].val[1], step1[6].val[1]); + step2[7].val[0] = vaddq_s32(step1[7].val[0], step1[6].val[0]); + step2[7].val[1] = vaddq_s32(step1[7].val[1], step1[6].val[1]); + step2[8] = step1[8]; + highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0].val[0] = vaddq_s32(step2[0].val[0], step2[3].val[0]); + step1[0].val[1] = vaddq_s32(step2[0].val[1], step2[3].val[1]); + step1[1].val[0] = vaddq_s32(step2[1].val[0], step2[2].val[0]); + step1[1].val[1] = vaddq_s32(step2[1].val[1], step2[2].val[1]); + step1[2].val[0] = vsubq_s32(step2[1].val[0], step2[2].val[0]); + step1[2].val[1] = vsubq_s32(step2[1].val[1], step2[2].val[1]); + step1[3].val[0] = vsubq_s32(step2[0].val[0], step2[3].val[0]); + step1[3].val[1] = vsubq_s32(step2[0].val[1], step2[3].val[1]); + step1[4] = step2[4]; + highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[11].val[0]); + step1[8].val[1] = vaddq_s32(step2[8].val[1], step2[11].val[1]); + step1[9].val[0] = vaddq_s32(step2[9].val[0], step2[10].val[0]); + step1[9].val[1] = vaddq_s32(step2[9].val[1], step2[10].val[1]); + step1[10].val[0] = vsubq_s32(step2[9].val[0], step2[10].val[0]); + step1[10].val[1] = vsubq_s32(step2[9].val[1], step2[10].val[1]); + step1[11].val[0] = vsubq_s32(step2[8].val[0], step2[11].val[0]); + step1[11].val[1] = vsubq_s32(step2[8].val[1], step2[11].val[1]); + step1[12].val[0] = vsubq_s32(step2[15].val[0], step2[12].val[0]); + step1[12].val[1] = vsubq_s32(step2[15].val[1], step2[12].val[1]); + step1[13].val[0] = vsubq_s32(step2[14].val[0], step2[13].val[0]); + step1[13].val[1] = vsubq_s32(step2[14].val[1], step2[13].val[1]); + step1[14].val[0] = vaddq_s32(step2[14].val[0], step2[13].val[0]); + step1[14].val[1] = 
vaddq_s32(step2[14].val[1], step2[13].val[1]); + step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[12].val[0]); + step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[12].val[1]); + + // stage 6 + step2[0].val[0] = vaddq_s32(step1[0].val[0], step1[7].val[0]); + step2[0].val[1] = vaddq_s32(step1[0].val[1], step1[7].val[1]); + step2[1].val[0] = vaddq_s32(step1[1].val[0], step1[6].val[0]); + step2[1].val[1] = vaddq_s32(step1[1].val[1], step1[6].val[1]); + step2[2].val[0] = vaddq_s32(step1[2].val[0], step1[5].val[0]); + step2[2].val[1] = vaddq_s32(step1[2].val[1], step1[5].val[1]); + step2[3].val[0] = vaddq_s32(step1[3].val[0], step1[4].val[0]); + step2[3].val[1] = vaddq_s32(step1[3].val[1], step1[4].val[1]); + step2[4].val[0] = vsubq_s32(step1[3].val[0], step1[4].val[0]); + step2[4].val[1] = vsubq_s32(step1[3].val[1], step1[4].val[1]); + step2[5].val[0] = vsubq_s32(step1[2].val[0], step1[5].val[0]); + step2[5].val[1] = vsubq_s32(step1[2].val[1], step1[5].val[1]); + step2[6].val[0] = vsubq_s32(step1[1].val[0], step1[6].val[0]); + step2[6].val[1] = vsubq_s32(step1[1].val[1], step1[6].val[1]); + step2[7].val[0] = vsubq_s32(step1[0].val[0], step1[7].val[0]); + step2[7].val[1] = vsubq_s32(step1[0].val[1], step1[7].val[1]); + highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7_dual(step2, out); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +static INLINE int32x4x2_t highbd_idct_cospi_lane0_dual(const int32x4x2_t s, + const int32x2_t coef) { + int64x2x2_t t[2]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s.val[0]), coef, 0); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 0); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 0); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 0); + return highbd_idct16x16_add_wrap_low_8x1(t); +} + +static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s, + const int32x2_t coef) { + int64x2x2_t t; + + t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 0); + t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 0); + return highbd_idct16x16_add_wrap_low_4x1(t); +} + +static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s, + const int32x2_t coef) { + int64x2x2_t t[2]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s.val[0]), coef, 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 1); + return highbd_idct16x16_add_wrap_low_8x1(t); +} + +static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s, + const int32x2_t coef) { + int64x2x2_t t; + + t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 1); + t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 1); + return highbd_idct16x16_add_wrap_low_4x1(t); +} + +static void vpx_highbd_idct16x16_38_add_half1d(const int32_t *input, + int32_t *output, uint16_t *dest, + const int stride, const int bd) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12); + 
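+  // Only the top-left 8x8 quadrant of coefficients can be non-zero in this case, so a single 8x8 load and transpose feeds each pass.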
int32x4x2_t in[8], step1[16], step2[16], out[16]; + + // Load input (8x8) + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 16; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 16; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = vld1q_s32(input + 4); + input += 16; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 16; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 16; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 16; + in[6].val[0] = vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 16; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + + // Transpose + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[2] = in[8 / 2]; + step1[4] = in[4 / 2]; + step1[6] = in[12 / 2]; + step1[8] = in[2 / 2]; + step1[10] = in[10 / 2]; + step1[12] = in[6 / 2]; + step1[14] = in[14 / 2]; // 0 in pass 1 + + // stage 2 + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[4] = step1[4]; + step2[6] = step1[6]; + step2[8] = + highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + step2[9] = highbd_idct_cospi_lane1_dual(step1[14], + vget_high_s32(cospi_6_26N_14_18N)); + step2[10] = + highbd_idct_cospi_lane1_dual(step1[10], vget_high_s32(cospi_2_30_10_22)); + step2[11] = + highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[12] = + highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[13] = + highbd_idct_cospi_lane0_dual(step1[10], vget_high_s32(cospi_2_30_10_22)); + step2[14] = highbd_idct_cospi_lane0_dual(step1[14], + vget_high_s32(cospi_6_26N_14_18N)); + step2[15] = + highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + + // stage 3 + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = + highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28)); + step1[5] = + highbd_idct_cospi_lane0_dual(step2[6], vget_high_s32(cospi_4_12_20N_28)); + step1[6] = + highbd_idct_cospi_lane1_dual(step2[6], vget_low_s32(cospi_4_12_20N_28)); + step1[7] = + highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28)); + step1[8] = highbd_idct_add_dual(step2[8], step2[9]); + step1[9] = highbd_idct_sub_dual(step2[8], step2[9]); + step1[10] = highbd_idct_sub_dual(step2[11], step2[10]); + step1[11] = highbd_idct_add_dual(step2[11], step2[10]); + step1[12] = highbd_idct_add_dual(step2[12], step2[13]); + step1[13] = highbd_idct_sub_dual(step2[12], step2[13]); + step1[14] = highbd_idct_sub_dual(step2[15], step2[14]); + step1[15] = highbd_idct_add_dual(step2[15], step2[14]); + + // stage 4 + step2[0] = step2[1] = + highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24)); + step2[2] = + highbd_idct_cospi_lane1_dual(step1[2], vget_high_s32(cospi_0_8_16_24)); + step2[3] = + highbd_idct_cospi_lane1_dual(step1[2], vget_low_s32(cospi_0_8_16_24)); + step2[4] = highbd_idct_add_dual(step1[4], step1[5]); + step2[5] = highbd_idct_sub_dual(step1[4], step1[5]); + step2[6] = highbd_idct_sub_dual(step1[7], step1[6]); + step2[7] = highbd_idct_add_dual(step1[7], step1[6]); + step2[8] = step1[8]; + highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + 
step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = highbd_idct_add_dual(step2[0], step2[3]); + step1[1] = highbd_idct_add_dual(step2[1], step2[2]); + step1[2] = highbd_idct_sub_dual(step2[1], step2[2]); + step1[3] = highbd_idct_sub_dual(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8] = highbd_idct_add_dual(step2[8], step2[11]); + step1[9] = highbd_idct_add_dual(step2[9], step2[10]); + step1[10] = highbd_idct_sub_dual(step2[9], step2[10]); + step1[11] = highbd_idct_sub_dual(step2[8], step2[11]); + step1[12] = highbd_idct_sub_dual(step2[15], step2[12]); + step1[13] = highbd_idct_sub_dual(step2[14], step2[13]); + step1[14] = highbd_idct_add_dual(step2[14], step2[13]); + step1[15] = highbd_idct_add_dual(step2[15], step2[12]); + + // stage 6 + step2[0] = highbd_idct_add_dual(step1[0], step1[7]); + step2[1] = highbd_idct_add_dual(step1[1], step1[6]); + step2[2] = highbd_idct_add_dual(step1[2], step1[5]); + step2[3] = highbd_idct_add_dual(step1[3], step1[4]); + step2[4] = highbd_idct_sub_dual(step1[3], step1[4]); + step2[5] = highbd_idct_sub_dual(step1[2], step1[5]); + step2[6] = highbd_idct_sub_dual(step1[1], step1[6]); + step2[7] = highbd_idct_sub_dual(step1[0], step1[7]); + highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7_dual(step2, out); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +void vpx_highbd_idct16x16_10_add_half1d_pass1(const tran_low_t *input, + int32_t *output) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12); + int32x4_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x4) + in[0] = vld1q_s32(input); + input += 16; + in[1] = vld1q_s32(input); + input += 16; + in[2] = vld1q_s32(input); + input += 16; + in[3] = vld1q_s32(input); + + // Transpose + transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = highbd_idct_cospi_lane1(step1[8], vget_low_s32(cospi_2_30_10_22)); + step2[11] = + highbd_idct_cospi_lane1(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[12] = + highbd_idct_cospi_lane0(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[15] = highbd_idct_cospi_lane0(step1[8], vget_low_s32(cospi_2_30_10_22)); + + // stage 3 + step1[0] = step2[0]; + step1[4] = + highbd_idct_cospi_lane1(step2[4], vget_high_s32(cospi_4_12_20N_28)); + step1[7] = highbd_idct_cospi_lane0(step2[4], vget_low_s32(cospi_4_12_20N_28)); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + step2[0] = step2[1] = + highbd_idct_cospi_lane0(step1[0], vget_high_s32(cospi_0_8_16_24)); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = 
step1[7]; + step2[8] = step1[8]; + highbd_idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s32(step2[8], step2[11]); + step1[9] = vaddq_s32(step2[9], step2[10]); + step1[10] = vsubq_s32(step2[9], step2[10]); + step1[11] = vsubq_s32(step2[8], step2[11]); + step1[12] = vsubq_s32(step2[15], step2[12]); + step1[13] = vsubq_s32(step2[14], step2[13]); + step1[14] = vaddq_s32(step2[14], step2[13]); + step1[15] = vaddq_s32(step2[15], step2[12]); + + // stage 6 + step2[0] = vaddq_s32(step1[0], step1[7]); + step2[1] = vaddq_s32(step1[1], step1[6]); + step2[2] = vaddq_s32(step1[2], step1[5]); + step2[3] = vaddq_s32(step1[3], step1[4]); + step2[4] = vsubq_s32(step1[3], step1[4]); + step2[5] = vsubq_s32(step1[2], step1[5]); + step2[6] = vsubq_s32(step1[1], step1[6]); + step2[7] = vsubq_s32(step1[0], step1[7]); + highbd_idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7(step2, out); + + // pass 1: save the result into output + vst1q_s32(output, out[0]); + output += 4; + vst1q_s32(output, out[1]); + output += 4; + vst1q_s32(output, out[2]); + output += 4; + vst1q_s32(output, out[3]); + output += 4; + vst1q_s32(output, out[4]); + output += 4; + vst1q_s32(output, out[5]); + output += 4; + vst1q_s32(output, out[6]); + output += 4; + vst1q_s32(output, out[7]); + output += 4; + vst1q_s32(output, out[8]); + output += 4; + vst1q_s32(output, out[9]); + output += 4; + vst1q_s32(output, out[10]); + output += 4; + vst1q_s32(output, out[11]); + output += 4; + vst1q_s32(output, out[12]); + output += 4; + vst1q_s32(output, out[13]); + output += 4; + vst1q_s32(output, out[14]); + output += 4; + vst1q_s32(output, out[15]); +} + +void vpx_highbd_idct16x16_10_add_half1d_pass2(const int32_t *input, + int32_t *const output, + uint16_t *const dest, + const int stride, const int bd) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12); + int32x4x2_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x8) + in[0].val[0] = vld1q_s32(input); + input += 4; + in[0].val[1] = vld1q_s32(input); + input += 4; + in[1].val[0] = vld1q_s32(input); + input += 4; + in[1].val[1] = vld1q_s32(input); + input += 4; + in[2].val[0] = vld1q_s32(input); + input += 4; + in[2].val[1] = vld1q_s32(input); + input += 4; + in[3].val[0] = vld1q_s32(input); + input += 4; + in[3].val[1] = vld1q_s32(input); + + // Transpose + transpose_s32_4x8(&in[0].val[0], &in[0].val[1], &in[1].val[0], &in[1].val[1], + &in[2].val[0], &in[2].val[1], &in[3].val[0], &in[3].val[1]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + 
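+  // With only the top-left 4x4 coefficients present, one input of each rotation is zero, so a single lane multiply replaces the usual multiply-accumulate pair.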
step2[8] = + highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + step2[11] = + highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[12] = + highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[15] = + highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + + // stage 3 + step1[0] = step2[0]; + step1[4] = + highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28)); + step1[7] = + highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28)); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + step2[0] = step2[1] = + highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24)); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8] = highbd_idct_add_dual(step2[8], step2[11]); + step1[9] = highbd_idct_add_dual(step2[9], step2[10]); + step1[10] = highbd_idct_sub_dual(step2[9], step2[10]); + step1[11] = highbd_idct_sub_dual(step2[8], step2[11]); + step1[12] = highbd_idct_sub_dual(step2[15], step2[12]); + step1[13] = highbd_idct_sub_dual(step2[14], step2[13]); + step1[14] = highbd_idct_add_dual(step2[14], step2[13]); + step1[15] = highbd_idct_add_dual(step2[15], step2[12]); + + // stage 6 + step2[0] = highbd_idct_add_dual(step1[0], step1[7]); + step2[1] = highbd_idct_add_dual(step1[1], step1[6]); + step2[2] = highbd_idct_add_dual(step1[2], step1[5]); + step2[3] = highbd_idct_add_dual(step1[3], step1[4]); + step2[4] = highbd_idct_sub_dual(step1[3], step1[4]); + step2[5] = highbd_idct_sub_dual(step1[2], step1[5]); + step2[6] = highbd_idct_sub_dual(step1[1], step1[6]); + step2[7] = highbd_idct_sub_dual(step1[0], step1[7]); + highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7_dual(step2, out); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + if (bd == 8) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 1); + + // Parallel idct on the lower 8 rows + vpx_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest, + stride, 1); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 1); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_256_add_half1d(row_idct_output + 8 * 16, NULL, 
dest + 8, + stride, 1); + } else { + int32_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_highbd_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, + bd); + + // Parallel idct on the lower 8 rows + vpx_highbd_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, + dest, stride, bd); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_highbd_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, + bd); + + // Parallel idct to get the right 8 columns + vpx_highbd_idct16x16_256_add_half1d(row_idct_output + 8 * 16, NULL, + dest + 8, stride, bd); + } +} + +void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + if (bd == 8) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 1); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 1); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, + stride, 1); + } else { + int32_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_highbd_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, + bd); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_highbd_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, bd); + + // Parallel idct to get the right 8 columns + vpx_highbd_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, + stride, bd); + } +} + +void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + if (bd == 8) { + int16_t row_idct_output[4 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 1); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8, + stride, 1); + } else { + int32_t row_idct_output[4 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_highbd_idct16x16_10_add_half1d_pass1(input, row_idct_output); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_highbd_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, + stride, bd); + + // Parallel idct to get the right 8 columns + vpx_highbd_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, + dest + 8, stride, bd); + } +} + +static INLINE void highbd_idct16x16_1_add_pos_kernel(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x8_t a0 = vld1q_u16(*dest + 0); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); + const int16x8_t c0 = vminq_s16(b0, max); + const int16x8_t c1 = vminq_s16(b1, max); + vst1q_u16(*dest + 0, vreinterpretq_u16_s16(c0)); + vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1)); + *dest += stride; +} + +static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest, + const int stride, + const int16x8_t res) { + const uint16x8_t a0 = vld1q_u16(*dest + 0); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, 
vreinterpretq_s16_u16(a1)); + const uint16x8_t c0 = vqshluq_n_s16(b0, 0); + const uint16x8_t c1 = vqshluq_n_s16(b1, 0); + vst1q_u16(*dest + 0, c0); + vst1q_u16(*dest + 8, c1); + *dest += stride; +} + +void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); + const int16x8_t dc = vdupq_n_s16(a1); + int i; + + if (a1 >= 0) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + for (i = 0; i < 4; ++i) { + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + } + } else { + for (i = 0; i < 4; ++i) { + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + } + } +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c new file mode 100644 index 0000000000..96a55c472f --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c @@ -0,0 +1,646 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void load_from_transformed(const int32_t *const trans_buf, + const int first, const int second, + int32x4x2_t *const q0, + int32x4x2_t *const q1) { + q0->val[0] = vld1q_s32(trans_buf + first * 8); + q0->val[1] = vld1q_s32(trans_buf + first * 8 + 4); + q1->val[0] = vld1q_s32(trans_buf + second * 8); + q1->val[1] = vld1q_s32(trans_buf + second * 8 + 4); +} + +static INLINE void load_from_output(const int32_t *const out, const int first, + const int second, int32x4x2_t *const q0, + int32x4x2_t *const q1) { + q0->val[0] = vld1q_s32(out + first * 32); + q0->val[1] = vld1q_s32(out + first * 32 + 4); + q1->val[0] = vld1q_s32(out + second * 32); + q1->val[1] = vld1q_s32(out + second * 32 + 4); +} + +static INLINE void store_in_output(int32_t *const out, const int first, + const int second, const int32x4x2_t q0, + const int32x4x2_t q1) { + vst1q_s32(out + first * 32, q0.val[0]); + vst1q_s32(out + first * 32 + 4, q0.val[1]); + vst1q_s32(out + second * 32, q1.val[0]); + vst1q_s32(out + second * 32 + 4, q1.val[1]); +} + +static INLINE void highbd_store_combine_results( + uint16_t *p1, uint16_t *p2, const int stride, const int32x4x2_t q0, + const int32x4x2_t q1, const int32x4x2_t q2, const int32x4x2_t q3, + const int16x8_t max) { + int16x8_t o[4]; + uint16x8_t d[4]; + + d[0] = vld1q_u16(p1); + p1 += stride; + d[1] = vld1q_u16(p1); + d[3] = vld1q_u16(p2); + p2 -= stride; + d[2] = vld1q_u16(p2); + + o[0] = vcombine_s16(vrshrn_n_s32(q0.val[0], 6), vrshrn_n_s32(q0.val[1], 6)); + o[1] = vcombine_s16(vrshrn_n_s32(q1.val[0], 6), vrshrn_n_s32(q1.val[1], 6)); + o[2] = vcombine_s16(vrshrn_n_s32(q2.val[0], 6), vrshrn_n_s32(q2.val[1], 6)); + o[3] = vcombine_s16(vrshrn_n_s32(q3.val[0], 6), vrshrn_n_s32(q3.val[1], 6)); + + o[0] = vqaddq_s16(o[0], vreinterpretq_s16_u16(d[0])); + o[1] = vqaddq_s16(o[1], vreinterpretq_s16_u16(d[1])); + o[2] = vqaddq_s16(o[2], vreinterpretq_s16_u16(d[2])); + o[3] = vqaddq_s16(o[3], vreinterpretq_s16_u16(d[3])); + o[0] = vminq_s16(o[0], max); + o[1] = vminq_s16(o[1], max); + o[2] = vminq_s16(o[2], max); + o[3] = vminq_s16(o[3], max); + d[0] = vqshluq_n_s16(o[0], 0); + d[1] = vqshluq_n_s16(o[1], 0); + d[2] = vqshluq_n_s16(o[2], 0); + d[3] = vqshluq_n_s16(o[3], 0); + + vst1q_u16(p1, d[1]); + p1 -= stride; + vst1q_u16(p1, d[0]); + vst1q_u16(p2, d[2]); + p2 += stride; + vst1q_u16(p2, d[3]); +} + +static INLINE void do_butterfly(const int32x4x2_t qIn0, const int32x4x2_t qIn1, + const int32_t first_const, + const int32_t second_const, + int32x4x2_t *const qOut0, + int32x4x2_t *const qOut1) { + int64x2x2_t q[4]; + int32x2_t d[6]; + + // Note: using v{mul, mla, mls}l_n_s32 here slows down 35% with gcc 4.9.
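+  // Broadcast the two constants, form qOut0 = qIn0 * first_const - qIn1 * second_const and qOut1 = qIn0 * second_const + qIn1 * first_const, then narrow each sum back to 32 bits with rounding by DCT_CONST_BITS.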
+ d[4] = vdup_n_s32(first_const); + d[5] = vdup_n_s32(second_const); + + q[0].val[0] = vmull_s32(vget_low_s32(qIn0.val[0]), d[4]); + q[0].val[1] = vmull_s32(vget_high_s32(qIn0.val[0]), d[4]); + q[1].val[0] = vmull_s32(vget_low_s32(qIn0.val[1]), d[4]); + q[1].val[1] = vmull_s32(vget_high_s32(qIn0.val[1]), d[4]); + q[0].val[0] = vmlsl_s32(q[0].val[0], vget_low_s32(qIn1.val[0]), d[5]); + q[0].val[1] = vmlsl_s32(q[0].val[1], vget_high_s32(qIn1.val[0]), d[5]); + q[1].val[0] = vmlsl_s32(q[1].val[0], vget_low_s32(qIn1.val[1]), d[5]); + q[1].val[1] = vmlsl_s32(q[1].val[1], vget_high_s32(qIn1.val[1]), d[5]); + + q[2].val[0] = vmull_s32(vget_low_s32(qIn0.val[0]), d[5]); + q[2].val[1] = vmull_s32(vget_high_s32(qIn0.val[0]), d[5]); + q[3].val[0] = vmull_s32(vget_low_s32(qIn0.val[1]), d[5]); + q[3].val[1] = vmull_s32(vget_high_s32(qIn0.val[1]), d[5]); + q[2].val[0] = vmlal_s32(q[2].val[0], vget_low_s32(qIn1.val[0]), d[4]); + q[2].val[1] = vmlal_s32(q[2].val[1], vget_high_s32(qIn1.val[0]), d[4]); + q[3].val[0] = vmlal_s32(q[3].val[0], vget_low_s32(qIn1.val[1]), d[4]); + q[3].val[1] = vmlal_s32(q[3].val[1], vget_high_s32(qIn1.val[1]), d[4]); + + qOut0->val[0] = vcombine_s32(vrshrn_n_s64(q[0].val[0], DCT_CONST_BITS), + vrshrn_n_s64(q[0].val[1], DCT_CONST_BITS)); + qOut0->val[1] = vcombine_s32(vrshrn_n_s64(q[1].val[0], DCT_CONST_BITS), + vrshrn_n_s64(q[1].val[1], DCT_CONST_BITS)); + qOut1->val[0] = vcombine_s32(vrshrn_n_s64(q[2].val[0], DCT_CONST_BITS), + vrshrn_n_s64(q[2].val[1], DCT_CONST_BITS)); + qOut1->val[1] = vcombine_s32(vrshrn_n_s64(q[3].val[0], DCT_CONST_BITS), + vrshrn_n_s64(q[3].val[1], DCT_CONST_BITS)); +} + +static INLINE void load_s32x4q_dual( + const int32_t *in, int32x4x2_t *const s0, int32x4x2_t *const s1, + int32x4x2_t *const s2, int32x4x2_t *const s3, int32x4x2_t *const s4, + int32x4x2_t *const s5, int32x4x2_t *const s6, int32x4x2_t *const s7) { + s0->val[0] = vld1q_s32(in); + s0->val[1] = vld1q_s32(in + 4); + in += 32; + s1->val[0] = vld1q_s32(in); + s1->val[1] = vld1q_s32(in + 4); + in += 32; + s2->val[0] = vld1q_s32(in); + s2->val[1] = vld1q_s32(in + 4); + in += 32; + s3->val[0] = vld1q_s32(in); + s3->val[1] = vld1q_s32(in + 4); + in += 32; + s4->val[0] = vld1q_s32(in); + s4->val[1] = vld1q_s32(in + 4); + in += 32; + s5->val[0] = vld1q_s32(in); + s5->val[1] = vld1q_s32(in + 4); + in += 32; + s6->val[0] = vld1q_s32(in); + s6->val[1] = vld1q_s32(in + 4); + in += 32; + s7->val[0] = vld1q_s32(in); + s7->val[1] = vld1q_s32(in + 4); +} + +static INLINE void transpose_and_store_s32_8x8(int32x4x2_t a0, int32x4x2_t a1, + int32x4x2_t a2, int32x4x2_t a3, + int32x4x2_t a4, int32x4x2_t a5, + int32x4x2_t a6, int32x4x2_t a7, + int32_t **out) { + transpose_s32_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + vst1q_s32(*out, a0.val[0]); + *out += 4; + vst1q_s32(*out, a0.val[1]); + *out += 4; + vst1q_s32(*out, a1.val[0]); + *out += 4; + vst1q_s32(*out, a1.val[1]); + *out += 4; + vst1q_s32(*out, a2.val[0]); + *out += 4; + vst1q_s32(*out, a2.val[1]); + *out += 4; + vst1q_s32(*out, a3.val[0]); + *out += 4; + vst1q_s32(*out, a3.val[1]); + *out += 4; + vst1q_s32(*out, a4.val[0]); + *out += 4; + vst1q_s32(*out, a4.val[1]); + *out += 4; + vst1q_s32(*out, a5.val[0]); + *out += 4; + vst1q_s32(*out, a5.val[1]); + *out += 4; + vst1q_s32(*out, a6.val[0]); + *out += 4; + vst1q_s32(*out, a6.val[1]); + *out += 4; + vst1q_s32(*out, a7.val[0]); + *out += 4; + vst1q_s32(*out, a7.val[1]); + *out += 4; +} + +static INLINE void idct32_transpose_pair(const int32_t *input, int32_t *t_buf) { + int i; + int32x4x2_t s0, s1, 
s2, s3, s4, s5, s6, s7; + + for (i = 0; i < 4; i++, input += 8) { + load_s32x4q_dual(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + transpose_and_store_s32_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf); + } +} + +static INLINE void idct32_bands_end_1st_pass(int32_t *const out, + int32x4x2_t *const q) { + store_in_output(out, 16, 17, q[6], q[7]); + store_in_output(out, 14, 15, q[8], q[9]); + + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + store_in_output(out, 30, 31, q[6], q[7]); + store_in_output(out, 0, 1, q[4], q[5]); + + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[10], q[1]); + q[3] = highbd_idct_add_dual(q[11], q[0]); + q[4] = highbd_idct_sub_dual(q[11], q[0]); + q[5] = highbd_idct_sub_dual(q[10], q[1]); + + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + store_in_output(out, 18, 19, q[6], q[7]); + store_in_output(out, 12, 13, q[8], q[9]); + + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + store_in_output(out, 28, 29, q[6], q[7]); + store_in_output(out, 2, 3, q[4], q[5]); + + load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[12], q[1]); + q[3] = highbd_idct_add_dual(q[13], q[0]); + q[4] = highbd_idct_sub_dual(q[13], q[0]); + q[5] = highbd_idct_sub_dual(q[12], q[1]); + + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + store_in_output(out, 20, 21, q[6], q[7]); + store_in_output(out, 10, 11, q[8], q[9]); + + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + store_in_output(out, 26, 27, q[6], q[7]); + store_in_output(out, 4, 5, q[4], q[5]); + + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[14], q[1]); + q[3] = highbd_idct_add_dual(q[15], q[0]); + q[4] = highbd_idct_sub_dual(q[15], q[0]); + q[5] = highbd_idct_sub_dual(q[14], q[1]); + + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + store_in_output(out, 22, 23, q[6], q[7]); + store_in_output(out, 8, 9, q[8], q[9]); + + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + store_in_output(out, 24, 25, q[6], q[7]); + store_in_output(out, 6, 7, q[4], q[5]); +} + +static INLINE void idct32_bands_end_2nd_pass(const int32_t *const out, + uint16_t *const dest, + const int stride, + const int16x8_t max, + int32x4x2_t *const q) { + uint16_t *dest0 = dest + 0 * stride; + uint16_t *dest1 = dest + 31 * stride; + uint16_t *dest2 = dest + 16 * stride; + uint16_t *dest3 = dest + 15 * stride; + 
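+  // The clipped results are combined with the destination two rows at a time, moving down from rows 0 and 16 and up from rows 15 and 31.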
const int str2 = stride << 1; + + highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9], + max); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7], + max); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[10], q[1]); + q[3] = highbd_idct_add_dual(q[11], q[0]); + q[4] = highbd_idct_sub_dual(q[11], q[0]); + q[5] = highbd_idct_sub_dual(q[10], q[1]); + + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9], + max); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7], + max); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[12], q[1]); + q[3] = highbd_idct_add_dual(q[13], q[0]); + q[4] = highbd_idct_sub_dual(q[13], q[0]); + q[5] = highbd_idct_sub_dual(q[12], q[1]); + + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9], + max); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7], + max); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[14], q[1]); + q[3] = highbd_idct_add_dual(q[15], q[0]); + q[4] = highbd_idct_sub_dual(q[15], q[0]); + q[5] = highbd_idct_sub_dual(q[14], q[1]); + + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9], + max); + + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7], + max); +} + +static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input, + uint16_t *dst, const int stride, + const int bd) { + int i, idct32_pass_loop; + int32_t trans_buf[32 * 8]; + int32_t pass1[32 * 32]; + int32_t pass2[32 * 32]; + int32_t *out; + int32x4x2_t q[16]; + + for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; + idct32_pass_loop++, input = pass1, out = pass2) { + for (i = 0; i < 4; i++, 
out += 8) { // idct32_bands_loop + idct32_transpose_pair(input, trans_buf); + input += 32 * 8; + + // ----------------------------------------- + // BLOCK A: 16-19,28-31 + // ----------------------------------------- + // generate 16,17,30,31 + // part of stage 1 + load_from_transformed(trans_buf, 1, 31, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_31_64, cospi_1_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 17, 15, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_15_64, cospi_17_64, &q[1], &q[3]); + // part of stage 2 + q[4] = highbd_idct_add_dual(q[0], q[1]); + q[13] = highbd_idct_sub_dual(q[0], q[1]); + q[6] = highbd_idct_add_dual(q[2], q[3]); + q[14] = highbd_idct_sub_dual(q[2], q[3]); + // part of stage 3 + do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[5], &q[7]); + + // generate 18,19,28,29 + // part of stage 1 + load_from_transformed(trans_buf, 9, 23, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_23_64, cospi_9_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 25, 7, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_7_64, cospi_25_64, &q[1], &q[3]); + // part of stage 2 + q[13] = highbd_idct_sub_dual(q[3], q[2]); + q[3] = highbd_idct_add_dual(q[3], q[2]); + q[14] = highbd_idct_sub_dual(q[1], q[0]); + q[2] = highbd_idct_add_dual(q[1], q[0]); + // part of stage 3 + do_butterfly(q[14], q[13], -cospi_4_64, -cospi_28_64, &q[1], &q[0]); + // part of stage 4 + q[8] = highbd_idct_add_dual(q[4], q[2]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[10] = highbd_idct_add_dual(q[7], q[1]); + q[15] = highbd_idct_add_dual(q[6], q[3]); + q[13] = highbd_idct_sub_dual(q[5], q[0]); + q[14] = highbd_idct_sub_dual(q[7], q[1]); + store_in_output(out, 16, 31, q[8], q[15]); + store_in_output(out, 17, 30, q[9], q[10]); + // part of stage 5 + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[0], &q[1]); + store_in_output(out, 29, 18, q[1], q[0]); + // part of stage 4 + q[13] = highbd_idct_sub_dual(q[4], q[2]); + q[14] = highbd_idct_sub_dual(q[6], q[3]); + // part of stage 5 + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[4], &q[6]); + store_in_output(out, 19, 28, q[4], q[6]); + + // ----------------------------------------- + // BLOCK B: 20-23,24-27 + // ----------------------------------------- + // generate 20,21,26,27 + // part of stage 1 + load_from_transformed(trans_buf, 5, 27, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_27_64, cospi_5_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 21, 11, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_11_64, cospi_21_64, &q[1], &q[3]); + // part of stage 2 + q[13] = highbd_idct_sub_dual(q[0], q[1]); + q[0] = highbd_idct_add_dual(q[0], q[1]); + q[14] = highbd_idct_sub_dual(q[2], q[3]); + q[2] = highbd_idct_add_dual(q[2], q[3]); + // part of stage 3 + do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]); + + // generate 22,23,24,25 + // part of stage 1 + load_from_transformed(trans_buf, 13, 19, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_19_64, cospi_13_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 29, 3, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_3_64, cospi_29_64, &q[4], &q[6]); + // part of stage 2 + q[14] = highbd_idct_sub_dual(q[4], q[5]); + q[5] = highbd_idct_add_dual(q[4], q[5]); + q[13] = highbd_idct_sub_dual(q[6], q[7]); + q[6] = highbd_idct_add_dual(q[6], q[7]); + // part of stage 3 + do_butterfly(q[14], q[13], -cospi_20_64, -cospi_12_64, &q[4], &q[7]); + // part of stage 4 + q[10] = highbd_idct_add_dual(q[7], q[1]); + q[11] = 
highbd_idct_add_dual(q[5], q[0]); + q[12] = highbd_idct_add_dual(q[6], q[2]); + q[15] = highbd_idct_add_dual(q[4], q[3]); + // part of stage 6 + load_from_output(out, 16, 17, &q[14], &q[13]); + q[8] = highbd_idct_add_dual(q[14], q[11]); + q[9] = highbd_idct_add_dual(q[13], q[10]); + q[13] = highbd_idct_sub_dual(q[13], q[10]); + q[11] = highbd_idct_sub_dual(q[14], q[11]); + store_in_output(out, 17, 16, q[9], q[8]); + load_from_output(out, 30, 31, &q[14], &q[9]); + q[8] = highbd_idct_sub_dual(q[9], q[12]); + q[10] = highbd_idct_add_dual(q[14], q[15]); + q[14] = highbd_idct_sub_dual(q[14], q[15]); + q[12] = highbd_idct_add_dual(q[9], q[12]); + store_in_output(out, 30, 31, q[10], q[12]); + // part of stage 7 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 25, 22, q[14], q[13]); + do_butterfly(q[8], q[11], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 24, 23, q[14], q[13]); + // part of stage 4 + q[14] = highbd_idct_sub_dual(q[5], q[0]); + q[13] = highbd_idct_sub_dual(q[6], q[2]); + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[5], &q[6]); + q[14] = highbd_idct_sub_dual(q[7], q[1]); + q[13] = highbd_idct_sub_dual(q[4], q[3]); + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[0], &q[1]); + // part of stage 6 + load_from_output(out, 18, 19, &q[14], &q[13]); + q[8] = highbd_idct_add_dual(q[14], q[1]); + q[9] = highbd_idct_add_dual(q[13], q[6]); + q[13] = highbd_idct_sub_dual(q[13], q[6]); + q[1] = highbd_idct_sub_dual(q[14], q[1]); + store_in_output(out, 18, 19, q[8], q[9]); + load_from_output(out, 28, 29, &q[8], &q[9]); + q[14] = highbd_idct_sub_dual(q[8], q[5]); + q[10] = highbd_idct_add_dual(q[8], q[5]); + q[11] = highbd_idct_add_dual(q[9], q[0]); + q[0] = highbd_idct_sub_dual(q[9], q[0]); + store_in_output(out, 28, 29, q[10], q[11]); + // part of stage 7 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 20, 27, q[13], q[14]); + do_butterfly(q[0], q[1], cospi_16_64, cospi_16_64, &q[1], &q[0]); + store_in_output(out, 21, 26, q[1], q[0]); + + // ----------------------------------------- + // BLOCK C: 8-10,11-15 + // ----------------------------------------- + // generate 8,9,14,15 + // part of stage 2 + load_from_transformed(trans_buf, 2, 30, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_30_64, cospi_2_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 18, 14, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_14_64, cospi_18_64, &q[1], &q[3]); + // part of stage 3 + q[13] = highbd_idct_sub_dual(q[0], q[1]); + q[0] = highbd_idct_add_dual(q[0], q[1]); + q[14] = highbd_idct_sub_dual(q[2], q[3]); + q[2] = highbd_idct_add_dual(q[2], q[3]); + // part of stage 4 + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[1], &q[3]); + + // generate 10,11,12,13 + // part of stage 2 + load_from_transformed(trans_buf, 10, 22, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_22_64, cospi_10_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 26, 6, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_6_64, cospi_26_64, &q[4], &q[6]); + // part of stage 3 + q[14] = highbd_idct_sub_dual(q[4], q[5]); + q[5] = highbd_idct_add_dual(q[4], q[5]); + q[13] = highbd_idct_sub_dual(q[6], q[7]); + q[6] = highbd_idct_add_dual(q[6], q[7]); + // part of stage 4 + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[4], &q[7]); + // part of stage 5 + q[8] = highbd_idct_add_dual(q[0], q[5]); + q[9] = highbd_idct_add_dual(q[1], q[7]); + q[13] = highbd_idct_sub_dual(q[1], q[7]); + 
q[14] = highbd_idct_sub_dual(q[3], q[4]); + q[10] = highbd_idct_add_dual(q[3], q[4]); + q[15] = highbd_idct_add_dual(q[2], q[6]); + store_in_output(out, 8, 15, q[8], q[15]); + store_in_output(out, 9, 14, q[9], q[10]); + // part of stage 6 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + store_in_output(out, 13, 10, q[3], q[1]); + q[13] = highbd_idct_sub_dual(q[0], q[5]); + q[14] = highbd_idct_sub_dual(q[2], q[6]); + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + store_in_output(out, 11, 12, q[1], q[3]); + + // ----------------------------------------- + // BLOCK D: 0-3,4-7 + // ----------------------------------------- + // generate 4,5,6,7 + // part of stage 3 + load_from_transformed(trans_buf, 4, 28, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 20, 12, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]); + // part of stage 4 + q[13] = highbd_idct_sub_dual(q[0], q[1]); + q[0] = highbd_idct_add_dual(q[0], q[1]); + q[14] = highbd_idct_sub_dual(q[2], q[3]); + q[2] = highbd_idct_add_dual(q[2], q[3]); + // part of stage 5 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + + // generate 0,1,2,3 + // part of stage 4 + load_from_transformed(trans_buf, 0, 16, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 8, 24, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[14], &q[6]); + // part of stage 5 + q[4] = highbd_idct_add_dual(q[7], q[6]); + q[7] = highbd_idct_sub_dual(q[7], q[6]); + q[6] = highbd_idct_sub_dual(q[5], q[14]); + q[5] = highbd_idct_add_dual(q[5], q[14]); + // part of stage 6 + q[8] = highbd_idct_add_dual(q[4], q[2]); + q[9] = highbd_idct_add_dual(q[5], q[3]); + q[10] = highbd_idct_add_dual(q[6], q[1]); + q[11] = highbd_idct_add_dual(q[7], q[0]); + q[12] = highbd_idct_sub_dual(q[7], q[0]); + q[13] = highbd_idct_sub_dual(q[6], q[1]); + q[14] = highbd_idct_sub_dual(q[5], q[3]); + q[15] = highbd_idct_sub_dual(q[4], q[2]); + // part of stage 7 + load_from_output(out, 14, 15, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[8], q[1]); + q[3] = highbd_idct_add_dual(q[9], q[0]); + q[4] = highbd_idct_sub_dual(q[9], q[0]); + q[5] = highbd_idct_sub_dual(q[8], q[1]); + load_from_output(out, 16, 17, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + + if (idct32_pass_loop == 0) { + idct32_bands_end_1st_pass(out, q); + } else { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + idct32_bands_end_2nd_pass(out, dst, stride, max, q); + dst += 8; + } + } + } +} + +void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + if (bd == 8) { + vpx_idct32_32_neon(input, CAST_TO_BYTEPTR(dest), stride, 1); + } else { + vpx_highbd_idct32_32_neon(input, dest, stride, bd); + } +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c new file mode 100644 index 0000000000..3970a5a861 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c @@ -0,0 +1,756 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void load_8x8_s32_dual( + const tran_low_t *input, int32x4x2_t *const in0, int32x4x2_t *const in1, + int32x4x2_t *const in2, int32x4x2_t *const in3, int32x4x2_t *const in4, + int32x4x2_t *const in5, int32x4x2_t *const in6, int32x4x2_t *const in7) { + in0->val[0] = vld1q_s32(input); + in0->val[1] = vld1q_s32(input + 4); + input += 32; + in1->val[0] = vld1q_s32(input); + in1->val[1] = vld1q_s32(input + 4); + input += 32; + in2->val[0] = vld1q_s32(input); + in2->val[1] = vld1q_s32(input + 4); + input += 32; + in3->val[0] = vld1q_s32(input); + in3->val[1] = vld1q_s32(input + 4); + input += 32; + in4->val[0] = vld1q_s32(input); + in4->val[1] = vld1q_s32(input + 4); + input += 32; + in5->val[0] = vld1q_s32(input); + in5->val[1] = vld1q_s32(input + 4); + input += 32; + in6->val[0] = vld1q_s32(input); + in6->val[1] = vld1q_s32(input + 4); + input += 32; + in7->val[0] = vld1q_s32(input); + in7->val[1] = vld1q_s32(input + 4); +} + +static INLINE void load_4x8_s32_dual(const tran_low_t *input, + int32x4_t *const in0, int32x4_t *const in1, + int32x4_t *const in2, int32x4_t *const in3, + int32x4_t *const in4, int32x4_t *const in5, + int32x4_t *const in6, + int32x4_t *const in7) { + *in0 = vld1q_s32(input); + input += 32; + *in1 = vld1q_s32(input); + input += 32; + *in2 = vld1q_s32(input); + input += 32; + *in3 = vld1q_s32(input); + input += 32; + *in4 = vld1q_s32(input); + input += 32; + *in5 = vld1q_s32(input); + input += 32; + *in6 = vld1q_s32(input); + input += 32; + *in7 = vld1q_s32(input); +} + +// Only for the first pass of the _135_ variant. Since it only uses values from +// the top left 16x16 it can safely assume all the remaining values are 0 and +// skip an awful lot of calculations. In fact, only the first 12 columns make +// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are +// used so it skips any calls to input[12|13|14|15] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 12x8 to allow using SIMD. 
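
The per-element arithmetic behind the multiply_shift_and_narrow, multiply_accumulate_shift_and_narrow, and add-/sub-multiply helpers used throughout these passes is assumed to be the usual cospi multiply followed by the 14-bit DCT rounding shift (DCT_CONST_BITS in vpx_dsp/txfm_common.h). A minimal scalar sketch under that assumption — the names below are illustrative, not the library's — would be:

#include <stdint.h>

#define SKETCH_DCT_CONST_BITS 14 /* assumed to match DCT_CONST_BITS in txfm_common.h */

/* Multiply one coefficient by a cospi constant, round, and shift back down. */
static int32_t sketch_mul_round_shift(int32_t a, int32_t cospi) {
  const int64_t t = (int64_t)a * cospi + (1 << (SKETCH_DCT_CONST_BITS - 1));
  return (int32_t)(t >> SKETCH_DCT_CONST_BITS);
}

/* The multiply_accumulate variant rotates a pair: a*c0 + b*c1, then rounds. */
static int32_t sketch_mul_acc_round_shift(int32_t a, int32_t c0, int32_t b, int32_t c1) {
  const int64_t t =
      (int64_t)a * c0 + (int64_t)b * c1 + (1 << (SKETCH_DCT_CONST_BITS - 1));
  return (int32_t)(t >> SKETCH_DCT_CONST_BITS);
}

/* The add_ (and, with a minus, sub_) variants combine the two inputs first,
 * then apply the same multiply-and-round step. */
static int32_t sketch_add_mul_round_shift(int32_t a, int32_t b, int32_t cospi) {
  return sketch_mul_round_shift(a + b, cospi);
}

The NEON versions added by this patch are assumed to perform the same math per lane on int32x4x2_t pairs, keeping 64-bit intermediates so high-bit-depth coefficients do not overflow before the rounding shift.
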
+ +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// 0 0 2 5 10 17 25 38 47 62 83 101 121 +// 1 1 4 8 15 22 30 45 58 74 92 112 133 +// 2 3 7 12 18 28 36 52 64 82 102 118 +// 3 6 11 16 23 31 43 60 73 90 109 126 +// 4 9 14 19 29 37 50 65 78 98 116 134 +// 5 13 20 26 35 44 54 72 85 105 123 +// 6 21 27 33 42 53 63 80 94 113 132 +// 7 24 32 39 48 57 71 88 104 120 +// 8 34 40 46 56 68 81 96 111 130 +// 9 41 49 55 67 77 91 107 124 +// 10 51 59 66 76 89 99 119 131 +// 11 61 69 75 87 100 114 129 +// 12 70 79 86 97 108 122 +// 13 84 93 103 110 125 +// 14 98 106 115 127 +// 15 117 128 +static void vpx_highbd_idct32_12_neon(const tran_low_t *const input, + int32_t *output) { + int32x4x2_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32], + s8[32]; + + load_8x8_s32_dual(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5], + &in[6], &in[7]); + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + load_4x8_s32_dual(input + 8, &in[8].val[0], &in[8].val[1], &in[9].val[0], + &in[9].val[1], &in[10].val[0], &in[10].val[1], + &in[11].val[0], &in[11].val[1]); + transpose_s32_4x8(&in[8].val[0], &in[8].val[1], &in[9].val[0], &in[9].val[1], + &in[10].val[0], &in[10].val[1], &in[11].val[0], + &in[11].val[1]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); + + s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64); + s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64); + + s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); + + s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64); + s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64); + + s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); + + s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64); + s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64); + + s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64); + + s2[18] = highbd_idct_sub_dual(s1[19], s1[18]); + s2[19] = highbd_idct_add_dual(s1[18], s1[19]); + s2[20] = highbd_idct_add_dual(s1[20], s1[21]); + s2[21] = highbd_idct_sub_dual(s1[20], s1[21]); + s2[26] = highbd_idct_sub_dual(s1[27], s1[26]); + s2[27] = highbd_idct_add_dual(s1[26], s1[27]); + s2[28] = highbd_idct_add_dual(s1[28], s1[29]); + s2[29] = highbd_idct_sub_dual(s1[28], s1[29]); + + // stage 3 + s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); + s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); + + s3[10] = highbd_idct_sub_dual(s2[11], s2[10]); + s3[11] = highbd_idct_add_dual(s2[10], s2[11]); + s3[12] = highbd_idct_add_dual(s2[12], s2[13]); + s3[13] = highbd_idct_sub_dual(s2[12], s2[13]); + + s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64, + s1[31], cospi_28_64); + s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64, + s1[31], cospi_4_64); + 
+ s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64, + s2[29], -cospi_4_64); + s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64, + s2[29], cospi_28_64); + + s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64, + s2[26], cospi_12_64); + s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64, + s2[26], cospi_20_64); + + s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); + s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64); + s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64); + + s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64, + s2[15], cospi_24_64); + s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64, + s2[15], cospi_8_64); + + s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64, + s3[13], -cospi_8_64); + s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64, + s3[13], cospi_24_64); + + s4[16] = highbd_idct_add_dual(s1[16], s2[19]); + s4[17] = highbd_idct_add_dual(s3[17], s3[18]); + s4[18] = highbd_idct_sub_dual(s3[17], s3[18]); + s4[19] = highbd_idct_sub_dual(s1[16], s2[19]); + s4[20] = highbd_idct_sub_dual(s1[23], s2[20]); + s4[21] = highbd_idct_sub_dual(s3[22], s3[21]); + s4[22] = highbd_idct_add_dual(s3[21], s3[22]); + s4[23] = highbd_idct_add_dual(s2[20], s1[23]); + s4[24] = highbd_idct_add_dual(s1[24], s2[27]); + s4[25] = highbd_idct_add_dual(s3[25], s3[26]); + s4[26] = highbd_idct_sub_dual(s3[25], s3[26]); + s4[27] = highbd_idct_sub_dual(s1[24], s2[27]); + s4[28] = highbd_idct_sub_dual(s1[31], s2[28]); + s4[29] = highbd_idct_sub_dual(s3[30], s3[29]); + s4[30] = highbd_idct_add_dual(s3[29], s3[30]); + s4[31] = highbd_idct_add_dual(s2[28], s1[31]); + + // stage 5 + s5[0] = highbd_idct_add_dual(s4[0], s4[3]); + s5[1] = highbd_idct_add_dual(s4[0], s4[2]); + s5[2] = highbd_idct_sub_dual(s4[0], s4[2]); + s5[3] = highbd_idct_sub_dual(s4[0], s4[3]); + + s5[5] = sub_multiply_shift_and_narrow_s32_dual(s3[7], s3[4], cospi_16_64); + s5[6] = add_multiply_shift_and_narrow_s32_dual(s3[4], s3[7], cospi_16_64); + + s5[8] = highbd_idct_add_dual(s2[8], s3[11]); + s5[9] = highbd_idct_add_dual(s4[9], s4[10]); + s5[10] = highbd_idct_sub_dual(s4[9], s4[10]); + s5[11] = highbd_idct_sub_dual(s2[8], s3[11]); + s5[12] = highbd_idct_sub_dual(s2[15], s3[12]); + s5[13] = highbd_idct_sub_dual(s4[14], s4[13]); + s5[14] = highbd_idct_add_dual(s4[13], s4[14]); + s5[15] = highbd_idct_add_dual(s2[15], s3[12]); + + s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64, + s4[29], cospi_24_64); + s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64, + s4[29], cospi_8_64); + + s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64, + s4[28], cospi_24_64); + s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64, + s4[28], cospi_8_64); + + s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64, + s4[27], -cospi_8_64); + s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64, + s4[27], cospi_24_64); + + s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64, + s4[26], -cospi_8_64); + s5[26] = 
multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64, + s4[26], cospi_24_64); + + // stage 6 + s6[0] = highbd_idct_add_dual(s5[0], s3[7]); + s6[1] = highbd_idct_add_dual(s5[1], s5[6]); + s6[2] = highbd_idct_add_dual(s5[2], s5[5]); + s6[3] = highbd_idct_add_dual(s5[3], s3[4]); + s6[4] = highbd_idct_sub_dual(s5[3], s3[4]); + s6[5] = highbd_idct_sub_dual(s5[2], s5[5]); + s6[6] = highbd_idct_sub_dual(s5[1], s5[6]); + s6[7] = highbd_idct_sub_dual(s5[0], s3[7]); + + s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64); + s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64); + + s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64); + s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64); + + s6[16] = highbd_idct_add_dual(s4[16], s4[23]); + s6[17] = highbd_idct_add_dual(s4[17], s4[22]); + s6[18] = highbd_idct_add_dual(s5[18], s5[21]); + s6[19] = highbd_idct_add_dual(s5[19], s5[20]); + s6[20] = highbd_idct_sub_dual(s5[19], s5[20]); + s6[21] = highbd_idct_sub_dual(s5[18], s5[21]); + s6[22] = highbd_idct_sub_dual(s4[17], s4[22]); + s6[23] = highbd_idct_sub_dual(s4[16], s4[23]); + + s6[24] = highbd_idct_sub_dual(s4[31], s4[24]); + s6[25] = highbd_idct_sub_dual(s4[30], s4[25]); + s6[26] = highbd_idct_sub_dual(s5[29], s5[26]); + s6[27] = highbd_idct_sub_dual(s5[28], s5[27]); + s6[28] = highbd_idct_add_dual(s5[27], s5[28]); + s6[29] = highbd_idct_add_dual(s5[26], s5[29]); + s6[30] = highbd_idct_add_dual(s4[25], s4[30]); + s6[31] = highbd_idct_add_dual(s4[24], s4[31]); + + // stage 7 + s7[0] = highbd_idct_add_dual(s6[0], s5[15]); + s7[1] = highbd_idct_add_dual(s6[1], s5[14]); + s7[2] = highbd_idct_add_dual(s6[2], s6[13]); + s7[3] = highbd_idct_add_dual(s6[3], s6[12]); + s7[4] = highbd_idct_add_dual(s6[4], s6[11]); + s7[5] = highbd_idct_add_dual(s6[5], s6[10]); + s7[6] = highbd_idct_add_dual(s6[6], s5[9]); + s7[7] = highbd_idct_add_dual(s6[7], s5[8]); + s7[8] = highbd_idct_sub_dual(s6[7], s5[8]); + s7[9] = highbd_idct_sub_dual(s6[6], s5[9]); + s7[10] = highbd_idct_sub_dual(s6[5], s6[10]); + s7[11] = highbd_idct_sub_dual(s6[4], s6[11]); + s7[12] = highbd_idct_sub_dual(s6[3], s6[12]); + s7[13] = highbd_idct_sub_dual(s6[2], s6[13]); + s7[14] = highbd_idct_sub_dual(s6[1], s5[14]); + s7[15] = highbd_idct_sub_dual(s6[0], s5[15]); + + s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64); + s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64); + + s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64); + s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64); + + s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64); + s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64); + + s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64); + s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64); + + // final stage + s8[0] = highbd_idct_add_dual(s7[0], s6[31]); + s8[1] = highbd_idct_add_dual(s7[1], s6[30]); + s8[2] = highbd_idct_add_dual(s7[2], s6[29]); + s8[3] = highbd_idct_add_dual(s7[3], s6[28]); + s8[4] = highbd_idct_add_dual(s7[4], s7[27]); + s8[5] = highbd_idct_add_dual(s7[5], s7[26]); + s8[6] = highbd_idct_add_dual(s7[6], s7[25]); + s8[7] = highbd_idct_add_dual(s7[7], s7[24]); + s8[8] = highbd_idct_add_dual(s7[8], s7[23]); + s8[9] = highbd_idct_add_dual(s7[9], s7[22]); + s8[10] = highbd_idct_add_dual(s7[10], s7[21]); + s8[11] = 
highbd_idct_add_dual(s7[11], s7[20]); + s8[12] = highbd_idct_add_dual(s7[12], s6[19]); + s8[13] = highbd_idct_add_dual(s7[13], s6[18]); + s8[14] = highbd_idct_add_dual(s7[14], s6[17]); + s8[15] = highbd_idct_add_dual(s7[15], s6[16]); + s8[16] = highbd_idct_sub_dual(s7[15], s6[16]); + s8[17] = highbd_idct_sub_dual(s7[14], s6[17]); + s8[18] = highbd_idct_sub_dual(s7[13], s6[18]); + s8[19] = highbd_idct_sub_dual(s7[12], s6[19]); + s8[20] = highbd_idct_sub_dual(s7[11], s7[20]); + s8[21] = highbd_idct_sub_dual(s7[10], s7[21]); + s8[22] = highbd_idct_sub_dual(s7[9], s7[22]); + s8[23] = highbd_idct_sub_dual(s7[8], s7[23]); + s8[24] = highbd_idct_sub_dual(s7[7], s7[24]); + s8[25] = highbd_idct_sub_dual(s7[6], s7[25]); + s8[26] = highbd_idct_sub_dual(s7[5], s7[26]); + s8[27] = highbd_idct_sub_dual(s7[4], s7[27]); + s8[28] = highbd_idct_sub_dual(s7[3], s6[28]); + s8[29] = highbd_idct_sub_dual(s7[2], s6[29]); + s8[30] = highbd_idct_sub_dual(s7[1], s6[30]); + s8[31] = highbd_idct_sub_dual(s7[0], s6[31]); + + vst1q_s32(output + 0, s8[0].val[0]); + vst1q_s32(output + 4, s8[0].val[1]); + output += 16; + vst1q_s32(output + 0, s8[1].val[0]); + vst1q_s32(output + 4, s8[1].val[1]); + output += 16; + vst1q_s32(output + 0, s8[2].val[0]); + vst1q_s32(output + 4, s8[2].val[1]); + output += 16; + vst1q_s32(output + 0, s8[3].val[0]); + vst1q_s32(output + 4, s8[3].val[1]); + output += 16; + vst1q_s32(output + 0, s8[4].val[0]); + vst1q_s32(output + 4, s8[4].val[1]); + output += 16; + vst1q_s32(output + 0, s8[5].val[0]); + vst1q_s32(output + 4, s8[5].val[1]); + output += 16; + vst1q_s32(output + 0, s8[6].val[0]); + vst1q_s32(output + 4, s8[6].val[1]); + output += 16; + vst1q_s32(output + 0, s8[7].val[0]); + vst1q_s32(output + 4, s8[7].val[1]); + output += 16; + + vst1q_s32(output + 0, s8[8].val[0]); + vst1q_s32(output + 4, s8[8].val[1]); + output += 16; + vst1q_s32(output + 0, s8[9].val[0]); + vst1q_s32(output + 4, s8[9].val[1]); + output += 16; + vst1q_s32(output + 0, s8[10].val[0]); + vst1q_s32(output + 4, s8[10].val[1]); + output += 16; + vst1q_s32(output + 0, s8[11].val[0]); + vst1q_s32(output + 4, s8[11].val[1]); + output += 16; + vst1q_s32(output + 0, s8[12].val[0]); + vst1q_s32(output + 4, s8[12].val[1]); + output += 16; + vst1q_s32(output + 0, s8[13].val[0]); + vst1q_s32(output + 4, s8[13].val[1]); + output += 16; + vst1q_s32(output + 0, s8[14].val[0]); + vst1q_s32(output + 4, s8[14].val[1]); + output += 16; + vst1q_s32(output + 0, s8[15].val[0]); + vst1q_s32(output + 4, s8[15].val[1]); + output += 16; + + vst1q_s32(output + 0, s8[16].val[0]); + vst1q_s32(output + 4, s8[16].val[1]); + output += 16; + vst1q_s32(output + 0, s8[17].val[0]); + vst1q_s32(output + 4, s8[17].val[1]); + output += 16; + vst1q_s32(output + 0, s8[18].val[0]); + vst1q_s32(output + 4, s8[18].val[1]); + output += 16; + vst1q_s32(output + 0, s8[19].val[0]); + vst1q_s32(output + 4, s8[19].val[1]); + output += 16; + vst1q_s32(output + 0, s8[20].val[0]); + vst1q_s32(output + 4, s8[20].val[1]); + output += 16; + vst1q_s32(output + 0, s8[21].val[0]); + vst1q_s32(output + 4, s8[21].val[1]); + output += 16; + vst1q_s32(output + 0, s8[22].val[0]); + vst1q_s32(output + 4, s8[22].val[1]); + output += 16; + vst1q_s32(output + 0, s8[23].val[0]); + vst1q_s32(output + 4, s8[23].val[1]); + output += 16; + + vst1q_s32(output + 0, s8[24].val[0]); + vst1q_s32(output + 4, s8[24].val[1]); + output += 16; + vst1q_s32(output + 0, s8[25].val[0]); + vst1q_s32(output + 4, s8[25].val[1]); + output += 16; + vst1q_s32(output + 0, s8[26].val[0]); + vst1q_s32(output + 
4, s8[26].val[1]); + output += 16; + vst1q_s32(output + 0, s8[27].val[0]); + vst1q_s32(output + 4, s8[27].val[1]); + output += 16; + vst1q_s32(output + 0, s8[28].val[0]); + vst1q_s32(output + 4, s8[28].val[1]); + output += 16; + vst1q_s32(output + 0, s8[29].val[0]); + vst1q_s32(output + 4, s8[29].val[1]); + output += 16; + vst1q_s32(output + 0, s8[30].val[0]); + vst1q_s32(output + 4, s8[30].val[1]); + output += 16; + vst1q_s32(output + 0, s8[31].val[0]); + vst1q_s32(output + 4, s8[31].val[1]); +} + +static void vpx_highbd_idct32_16_neon(const int32_t *const input, + uint16_t *const output, const int stride, + const int bd) { + int32x4x2_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32], + out[32]; + + load_and_transpose_s32_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4], + &in[5], &in[6], &in[7]); + + load_and_transpose_s32_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11], + &in[12], &in[13], &in[14], &in[15]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); + + s1[17] = multiply_shift_and_narrow_s32_dual(in[15], -cospi_17_64); + s1[30] = multiply_shift_and_narrow_s32_dual(in[15], cospi_15_64); + + s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64); + s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64); + + s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); + + s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64); + s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64); + + s1[22] = multiply_shift_and_narrow_s32_dual(in[13], cospi_19_64); + s1[25] = multiply_shift_and_narrow_s32_dual(in[13], cospi_13_64); + + s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); + + s2[9] = multiply_shift_and_narrow_s32_dual(in[14], -cospi_18_64); + s2[14] = multiply_shift_and_narrow_s32_dual(in[14], cospi_14_64); + + s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64); + s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64); + + s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64); + + s2[16] = highbd_idct_add_dual(s1[16], s1[17]); + s2[17] = highbd_idct_sub_dual(s1[16], s1[17]); + s2[18] = highbd_idct_sub_dual(s1[19], s1[18]); + s2[19] = highbd_idct_add_dual(s1[18], s1[19]); + s2[20] = highbd_idct_add_dual(s1[20], s1[21]); + s2[21] = highbd_idct_sub_dual(s1[20], s1[21]); + s2[22] = highbd_idct_sub_dual(s1[23], s1[22]); + s2[23] = highbd_idct_add_dual(s1[22], s1[23]); + s2[24] = highbd_idct_add_dual(s1[24], s1[25]); + s2[25] = highbd_idct_sub_dual(s1[24], s1[25]); + s2[26] = highbd_idct_sub_dual(s1[27], s1[26]); + s2[27] = highbd_idct_add_dual(s1[26], s1[27]); + s2[28] = highbd_idct_add_dual(s1[28], s1[29]); + s2[29] = highbd_idct_sub_dual(s1[28], s1[29]); + s2[30] = highbd_idct_sub_dual(s1[31], s1[30]); + s2[31] = highbd_idct_add_dual(s1[30], s1[31]); + + // stage 3 + s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); + s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); + + 
s3[5] = multiply_shift_and_narrow_s32_dual(in[12], -cospi_20_64); + s3[6] = multiply_shift_and_narrow_s32_dual(in[12], cospi_12_64); + + s3[8] = highbd_idct_add_dual(s2[8], s2[9]); + s3[9] = highbd_idct_sub_dual(s2[8], s2[9]); + s3[10] = highbd_idct_sub_dual(s2[11], s2[10]); + s3[11] = highbd_idct_add_dual(s2[10], s2[11]); + s3[12] = highbd_idct_add_dual(s2[12], s2[13]); + s3[13] = highbd_idct_sub_dual(s2[12], s2[13]); + s3[14] = highbd_idct_sub_dual(s2[15], s2[14]); + s3[15] = highbd_idct_add_dual(s2[14], s2[15]); + + s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], -cospi_4_64, + s2[30], cospi_28_64); + s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], cospi_28_64, + s2[30], cospi_4_64); + + s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64, + s2[29], -cospi_4_64); + s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64, + s2[29], cospi_28_64); + + s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64, + s2[26], cospi_12_64); + s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64, + s2[26], cospi_20_64); + + s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_12_64, + s2[25], -cospi_20_64); + s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_20_64, + s2[25], cospi_12_64); + + // stage 4 + s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); + s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64); + s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64); + + s4[4] = highbd_idct_add_dual(s3[4], s3[5]); + s4[5] = highbd_idct_sub_dual(s3[4], s3[5]); + s4[6] = highbd_idct_sub_dual(s3[7], s3[6]); + s4[7] = highbd_idct_add_dual(s3[6], s3[7]); + + s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], -cospi_8_64, + s3[14], cospi_24_64); + s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], cospi_24_64, + s3[14], cospi_8_64); + + s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64, + s3[13], -cospi_8_64); + s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64, + s3[13], cospi_24_64); + + s4[16] = highbd_idct_add_dual(s2[16], s2[19]); + s4[17] = highbd_idct_add_dual(s3[17], s3[18]); + s4[18] = highbd_idct_sub_dual(s3[17], s3[18]); + s4[19] = highbd_idct_sub_dual(s2[16], s2[19]); + s4[20] = highbd_idct_sub_dual(s2[23], s2[20]); + s4[21] = highbd_idct_sub_dual(s3[22], s3[21]); + s4[22] = highbd_idct_add_dual(s3[21], s3[22]); + s4[23] = highbd_idct_add_dual(s2[20], s2[23]); + s4[24] = highbd_idct_add_dual(s2[24], s2[27]); + s4[25] = highbd_idct_add_dual(s3[25], s3[26]); + s4[26] = highbd_idct_sub_dual(s3[25], s3[26]); + s4[27] = highbd_idct_sub_dual(s2[24], s2[27]); + s4[28] = highbd_idct_sub_dual(s2[31], s2[28]); + s4[29] = highbd_idct_sub_dual(s3[30], s3[29]); + s4[30] = highbd_idct_add_dual(s3[29], s3[30]); + s4[31] = highbd_idct_add_dual(s2[28], s2[31]); + + // stage 5 + s5[0] = highbd_idct_add_dual(s4[0], s4[3]); + s5[1] = highbd_idct_add_dual(s4[0], s4[2]); + s5[2] = highbd_idct_sub_dual(s4[0], s4[2]); + s5[3] = highbd_idct_sub_dual(s4[0], s4[3]); + + s5[5] = sub_multiply_shift_and_narrow_s32_dual(s4[6], s4[5], cospi_16_64); + s5[6] = add_multiply_shift_and_narrow_s32_dual(s4[5], s4[6], cospi_16_64); + + s5[8] = highbd_idct_add_dual(s3[8], s3[11]); + s5[9] = highbd_idct_add_dual(s4[9], s4[10]); + s5[10] = highbd_idct_sub_dual(s4[9], s4[10]); + s5[11] = highbd_idct_sub_dual(s3[8], s3[11]); + s5[12] = highbd_idct_sub_dual(s3[15], s3[12]); + s5[13] = 
highbd_idct_sub_dual(s4[14], s4[13]); + s5[14] = highbd_idct_add_dual(s4[13], s4[14]); + s5[15] = highbd_idct_add_dual(s3[15], s3[12]); + + s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64, + s4[29], cospi_24_64); + s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64, + s4[29], cospi_8_64); + + s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64, + s4[28], cospi_24_64); + s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64, + s4[28], cospi_8_64); + + s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64, + s4[27], -cospi_8_64); + s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64, + s4[27], cospi_24_64); + + s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64, + s4[26], -cospi_8_64); + s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64, + s4[26], cospi_24_64); + + // stage 6 + s6[0] = highbd_idct_add_dual(s5[0], s4[7]); + s6[1] = highbd_idct_add_dual(s5[1], s5[6]); + s6[2] = highbd_idct_add_dual(s5[2], s5[5]); + s6[3] = highbd_idct_add_dual(s5[3], s4[4]); + s6[4] = highbd_idct_sub_dual(s5[3], s4[4]); + s6[5] = highbd_idct_sub_dual(s5[2], s5[5]); + s6[6] = highbd_idct_sub_dual(s5[1], s5[6]); + s6[7] = highbd_idct_sub_dual(s5[0], s4[7]); + + s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64); + s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64); + + s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64); + s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64); + + s6[16] = highbd_idct_add_dual(s4[16], s4[23]); + s6[17] = highbd_idct_add_dual(s4[17], s4[22]); + s6[18] = highbd_idct_add_dual(s5[18], s5[21]); + s6[19] = highbd_idct_add_dual(s5[19], s5[20]); + s6[20] = highbd_idct_sub_dual(s5[19], s5[20]); + s6[21] = highbd_idct_sub_dual(s5[18], s5[21]); + s6[22] = highbd_idct_sub_dual(s4[17], s4[22]); + s6[23] = highbd_idct_sub_dual(s4[16], s4[23]); + s6[24] = highbd_idct_sub_dual(s4[31], s4[24]); + s6[25] = highbd_idct_sub_dual(s4[30], s4[25]); + s6[26] = highbd_idct_sub_dual(s5[29], s5[26]); + s6[27] = highbd_idct_sub_dual(s5[28], s5[27]); + s6[28] = highbd_idct_add_dual(s5[27], s5[28]); + s6[29] = highbd_idct_add_dual(s5[26], s5[29]); + s6[30] = highbd_idct_add_dual(s4[25], s4[30]); + s6[31] = highbd_idct_add_dual(s4[24], s4[31]); + + // stage 7 + s7[0] = highbd_idct_add_dual(s6[0], s5[15]); + s7[1] = highbd_idct_add_dual(s6[1], s5[14]); + s7[2] = highbd_idct_add_dual(s6[2], s6[13]); + s7[3] = highbd_idct_add_dual(s6[3], s6[12]); + s7[4] = highbd_idct_add_dual(s6[4], s6[11]); + s7[5] = highbd_idct_add_dual(s6[5], s6[10]); + s7[6] = highbd_idct_add_dual(s6[6], s5[9]); + s7[7] = highbd_idct_add_dual(s6[7], s5[8]); + s7[8] = highbd_idct_sub_dual(s6[7], s5[8]); + s7[9] = highbd_idct_sub_dual(s6[6], s5[9]); + s7[10] = highbd_idct_sub_dual(s6[5], s6[10]); + s7[11] = highbd_idct_sub_dual(s6[4], s6[11]); + s7[12] = highbd_idct_sub_dual(s6[3], s6[12]); + s7[13] = highbd_idct_sub_dual(s6[2], s6[13]); + s7[14] = highbd_idct_sub_dual(s6[1], s5[14]); + s7[15] = highbd_idct_sub_dual(s6[0], s5[15]); + + s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64); + s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64); + + s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64); + s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64); + + 
s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64); + s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64); + + s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64); + s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64); + + // final stage + out[0] = highbd_idct_add_dual(s7[0], s6[31]); + out[1] = highbd_idct_add_dual(s7[1], s6[30]); + out[2] = highbd_idct_add_dual(s7[2], s6[29]); + out[3] = highbd_idct_add_dual(s7[3], s6[28]); + out[4] = highbd_idct_add_dual(s7[4], s7[27]); + out[5] = highbd_idct_add_dual(s7[5], s7[26]); + out[6] = highbd_idct_add_dual(s7[6], s7[25]); + out[7] = highbd_idct_add_dual(s7[7], s7[24]); + out[8] = highbd_idct_add_dual(s7[8], s7[23]); + out[9] = highbd_idct_add_dual(s7[9], s7[22]); + out[10] = highbd_idct_add_dual(s7[10], s7[21]); + out[11] = highbd_idct_add_dual(s7[11], s7[20]); + out[12] = highbd_idct_add_dual(s7[12], s6[19]); + out[13] = highbd_idct_add_dual(s7[13], s6[18]); + out[14] = highbd_idct_add_dual(s7[14], s6[17]); + out[15] = highbd_idct_add_dual(s7[15], s6[16]); + out[16] = highbd_idct_sub_dual(s7[15], s6[16]); + out[17] = highbd_idct_sub_dual(s7[14], s6[17]); + out[18] = highbd_idct_sub_dual(s7[13], s6[18]); + out[19] = highbd_idct_sub_dual(s7[12], s6[19]); + out[20] = highbd_idct_sub_dual(s7[11], s7[20]); + out[21] = highbd_idct_sub_dual(s7[10], s7[21]); + out[22] = highbd_idct_sub_dual(s7[9], s7[22]); + out[23] = highbd_idct_sub_dual(s7[8], s7[23]); + out[24] = highbd_idct_sub_dual(s7[7], s7[24]); + out[25] = highbd_idct_sub_dual(s7[6], s7[25]); + out[26] = highbd_idct_sub_dual(s7[5], s7[26]); + out[27] = highbd_idct_sub_dual(s7[4], s7[27]); + out[28] = highbd_idct_sub_dual(s7[3], s6[28]); + out[29] = highbd_idct_sub_dual(s7[2], s6[29]); + out[30] = highbd_idct_sub_dual(s7[1], s6[30]); + out[31] = highbd_idct_sub_dual(s7[0], s6[31]); + + highbd_idct16x16_add_store(out, output, stride, bd); + highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd); +} + +void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + + if (bd == 8) { + int16_t temp[32 * 16]; + int16_t *t = temp; + vpx_idct32_12_neon(input, temp); + vpx_idct32_12_neon(input + 32 * 8, temp + 8); + + for (i = 0; i < 32; i += 8) { + vpx_idct32_16_neon(t, dest, stride, 1); + t += (16 * 8); + dest += 8; + } + } else { + int32_t temp[32 * 16]; + int32_t *t = temp; + vpx_highbd_idct32_12_neon(input, temp); + vpx_highbd_idct32_12_neon(input + 32 * 8, temp + 8); + + for (i = 0; i < 32; i += 8) { + vpx_highbd_idct32_16_neon(t, dest, stride, bd); + t += (16 * 8); + dest += 8; + } + } +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c new file mode 100644 index 0000000000..5d9063b15d --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c @@ -0,0 +1,624 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +// Only for the first pass of the _34_ variant. Since it only uses values from +// the top left 8x8 it can safely assume all the remaining values are 0 and skip +// an awful lot of calculations. In fact, only the first 6 columns make the cut. +// None of the elements in the 7th or 8th column are used so it skips any calls +// to input[67] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 8x8 to allow using SIMD. + +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 +// 0 0 2 5 10 17 25 +// 1 1 4 8 15 22 30 +// 2 3 7 12 18 28 +// 3 6 11 16 23 31 +// 4 9 14 19 29 +// 5 13 20 26 +// 6 21 27 33 +// 7 24 32 +static void vpx_highbd_idct32_6_neon(const tran_low_t *input, int32_t *output) { + int32x4x2_t in[8], s1[32], s2[32], s3[32]; + + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 32; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 32; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = vld1q_s32(input + 4); + input += 32; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 32; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 32; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 32; + in[6].val[0] = vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 32; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + // stage 1 + // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0) + s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); + // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0) + s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); + + s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); + + s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); + + // stage 3 + s1[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); + s1[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); + + s1[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64, + s1[31], cospi_28_64); + s1[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64, + s1[31], cospi_4_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], -cospi_20_64, + s1[27], cospi_12_64); + s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], cospi_12_64, + s1[27], cospi_20_64); + + s1[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s1[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s1[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); + + s2[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64, + s2[15], cospi_24_64); + s2[14] = 
multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64, + s2[15], cospi_8_64); + + s2[20] = highbd_idct_sub_dual(s1[23], s1[20]); + s2[21] = highbd_idct_sub_dual(s1[22], s1[21]); + s2[22] = highbd_idct_add_dual(s1[21], s1[22]); + s2[23] = highbd_idct_add_dual(s1[20], s1[23]); + s2[24] = highbd_idct_add_dual(s1[24], s1[27]); + s2[25] = highbd_idct_add_dual(s1[25], s1[26]); + s2[26] = highbd_idct_sub_dual(s1[25], s1[26]); + s2[27] = highbd_idct_sub_dual(s1[24], s1[27]); + + // stage 5 + s1[5] = sub_multiply_shift_and_narrow_s32_dual(s1[7], s1[4], cospi_16_64); + s1[6] = add_multiply_shift_and_narrow_s32_dual(s1[4], s1[7], cospi_16_64); + + s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s1[17], -cospi_8_64, + s1[30], cospi_24_64); + s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s1[17], cospi_24_64, + s1[30], cospi_8_64); + + s1[19] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_8_64, + s1[31], cospi_24_64); + s1[28] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_24_64, + s1[31], cospi_8_64); + + s1[20] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_24_64, + s2[27], -cospi_8_64); + s1[27] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_8_64, + s2[27], cospi_24_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_24_64, + s2[26], -cospi_8_64); + s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_8_64, + s2[26], cospi_24_64); + + // stage 6 + s2[0] = highbd_idct_add_dual(s1[0], s1[7]); + s2[1] = highbd_idct_add_dual(s1[0], s1[6]); + s2[2] = highbd_idct_add_dual(s1[0], s1[5]); + s2[3] = highbd_idct_add_dual(s1[0], s1[4]); + s2[4] = highbd_idct_sub_dual(s1[0], s1[4]); + s2[5] = highbd_idct_sub_dual(s1[0], s1[5]); + s2[6] = highbd_idct_sub_dual(s1[0], s1[6]); + s2[7] = highbd_idct_sub_dual(s1[0], s1[7]); + + s2[10] = sub_multiply_shift_and_narrow_s32_dual(s2[14], s2[9], cospi_16_64); + s2[13] = add_multiply_shift_and_narrow_s32_dual(s2[9], s2[14], cospi_16_64); + + s2[11] = sub_multiply_shift_and_narrow_s32_dual(s2[15], s2[8], cospi_16_64); + s2[12] = add_multiply_shift_and_narrow_s32_dual(s2[8], s2[15], cospi_16_64); + + s2[16] = highbd_idct_add_dual(s1[16], s2[23]); + s2[17] = highbd_idct_add_dual(s1[17], s2[22]); + s2[18] = highbd_idct_add_dual(s1[18], s1[21]); + s2[19] = highbd_idct_add_dual(s1[19], s1[20]); + s2[20] = highbd_idct_sub_dual(s1[19], s1[20]); + s2[21] = highbd_idct_sub_dual(s1[18], s1[21]); + s2[22] = highbd_idct_sub_dual(s1[17], s2[22]); + s2[23] = highbd_idct_sub_dual(s1[16], s2[23]); + + s3[24] = highbd_idct_sub_dual(s1[31], s2[24]); + s3[25] = highbd_idct_sub_dual(s1[30], s2[25]); + s3[26] = highbd_idct_sub_dual(s1[29], s1[26]); + s3[27] = highbd_idct_sub_dual(s1[28], s1[27]); + s2[28] = highbd_idct_add_dual(s1[27], s1[28]); + s2[29] = highbd_idct_add_dual(s1[26], s1[29]); + s2[30] = highbd_idct_add_dual(s2[25], s1[30]); + s2[31] = highbd_idct_add_dual(s2[24], s1[31]); + + // stage 7 + s1[0] = highbd_idct_add_dual(s2[0], s2[15]); + s1[1] = highbd_idct_add_dual(s2[1], s2[14]); + s1[2] = highbd_idct_add_dual(s2[2], s2[13]); + s1[3] = highbd_idct_add_dual(s2[3], s2[12]); + s1[4] = highbd_idct_add_dual(s2[4], s2[11]); + s1[5] = highbd_idct_add_dual(s2[5], s2[10]); + s1[6] = highbd_idct_add_dual(s2[6], s2[9]); + s1[7] = highbd_idct_add_dual(s2[7], s2[8]); + s1[8] = highbd_idct_sub_dual(s2[7], s2[8]); + s1[9] = highbd_idct_sub_dual(s2[6], s2[9]); + s1[10] = highbd_idct_sub_dual(s2[5], s2[10]); + s1[11] = highbd_idct_sub_dual(s2[4], s2[11]); + s1[12] = 
highbd_idct_sub_dual(s2[3], s2[12]); + s1[13] = highbd_idct_sub_dual(s2[2], s2[13]); + s1[14] = highbd_idct_sub_dual(s2[1], s2[14]); + s1[15] = highbd_idct_sub_dual(s2[0], s2[15]); + + s1[20] = sub_multiply_shift_and_narrow_s32_dual(s3[27], s2[20], cospi_16_64); + s1[27] = add_multiply_shift_and_narrow_s32_dual(s2[20], s3[27], cospi_16_64); + + s1[21] = sub_multiply_shift_and_narrow_s32_dual(s3[26], s2[21], cospi_16_64); + s1[26] = add_multiply_shift_and_narrow_s32_dual(s2[21], s3[26], cospi_16_64); + + s1[22] = sub_multiply_shift_and_narrow_s32_dual(s3[25], s2[22], cospi_16_64); + s1[25] = add_multiply_shift_and_narrow_s32_dual(s2[22], s3[25], cospi_16_64); + + s1[23] = sub_multiply_shift_and_narrow_s32_dual(s3[24], s2[23], cospi_16_64); + s1[24] = add_multiply_shift_and_narrow_s32_dual(s2[23], s3[24], cospi_16_64); + + // final stage + s3[0] = highbd_idct_add_dual(s1[0], s2[31]); + s3[1] = highbd_idct_add_dual(s1[1], s2[30]); + s3[2] = highbd_idct_add_dual(s1[2], s2[29]); + s3[3] = highbd_idct_add_dual(s1[3], s2[28]); + s3[4] = highbd_idct_add_dual(s1[4], s1[27]); + s3[5] = highbd_idct_add_dual(s1[5], s1[26]); + s3[6] = highbd_idct_add_dual(s1[6], s1[25]); + s3[7] = highbd_idct_add_dual(s1[7], s1[24]); + s3[8] = highbd_idct_add_dual(s1[8], s1[23]); + s3[9] = highbd_idct_add_dual(s1[9], s1[22]); + s3[10] = highbd_idct_add_dual(s1[10], s1[21]); + s3[11] = highbd_idct_add_dual(s1[11], s1[20]); + s3[12] = highbd_idct_add_dual(s1[12], s2[19]); + s3[13] = highbd_idct_add_dual(s1[13], s2[18]); + s3[14] = highbd_idct_add_dual(s1[14], s2[17]); + s3[15] = highbd_idct_add_dual(s1[15], s2[16]); + s3[16] = highbd_idct_sub_dual(s1[15], s2[16]); + s3[17] = highbd_idct_sub_dual(s1[14], s2[17]); + s3[18] = highbd_idct_sub_dual(s1[13], s2[18]); + s3[19] = highbd_idct_sub_dual(s1[12], s2[19]); + s3[20] = highbd_idct_sub_dual(s1[11], s1[20]); + s3[21] = highbd_idct_sub_dual(s1[10], s1[21]); + s3[22] = highbd_idct_sub_dual(s1[9], s1[22]); + s3[23] = highbd_idct_sub_dual(s1[8], s1[23]); + s3[24] = highbd_idct_sub_dual(s1[7], s1[24]); + s3[25] = highbd_idct_sub_dual(s1[6], s1[25]); + s3[26] = highbd_idct_sub_dual(s1[5], s1[26]); + s3[27] = highbd_idct_sub_dual(s1[4], s1[27]); + s3[28] = highbd_idct_sub_dual(s1[3], s2[28]); + s3[29] = highbd_idct_sub_dual(s1[2], s2[29]); + s3[30] = highbd_idct_sub_dual(s1[1], s2[30]); + s3[31] = highbd_idct_sub_dual(s1[0], s2[31]); + + vst1q_s32(output, s3[0].val[0]); + output += 4; + vst1q_s32(output, s3[0].val[1]); + output += 4; + vst1q_s32(output, s3[1].val[0]); + output += 4; + vst1q_s32(output, s3[1].val[1]); + output += 4; + vst1q_s32(output, s3[2].val[0]); + output += 4; + vst1q_s32(output, s3[2].val[1]); + output += 4; + vst1q_s32(output, s3[3].val[0]); + output += 4; + vst1q_s32(output, s3[3].val[1]); + output += 4; + vst1q_s32(output, s3[4].val[0]); + output += 4; + vst1q_s32(output, s3[4].val[1]); + output += 4; + vst1q_s32(output, s3[5].val[0]); + output += 4; + vst1q_s32(output, s3[5].val[1]); + output += 4; + vst1q_s32(output, s3[6].val[0]); + output += 4; + vst1q_s32(output, s3[6].val[1]); + output += 4; + vst1q_s32(output, s3[7].val[0]); + output += 4; + vst1q_s32(output, s3[7].val[1]); + output += 4; + + vst1q_s32(output, s3[8].val[0]); + output += 4; + vst1q_s32(output, s3[8].val[1]); + output += 4; + vst1q_s32(output, s3[9].val[0]); + output += 4; + vst1q_s32(output, s3[9].val[1]); + output += 4; + vst1q_s32(output, s3[10].val[0]); + output += 4; + vst1q_s32(output, s3[10].val[1]); + output += 4; + vst1q_s32(output, s3[11].val[0]); + output += 4; + 
vst1q_s32(output, s3[11].val[1]); + output += 4; + vst1q_s32(output, s3[12].val[0]); + output += 4; + vst1q_s32(output, s3[12].val[1]); + output += 4; + vst1q_s32(output, s3[13].val[0]); + output += 4; + vst1q_s32(output, s3[13].val[1]); + output += 4; + vst1q_s32(output, s3[14].val[0]); + output += 4; + vst1q_s32(output, s3[14].val[1]); + output += 4; + vst1q_s32(output, s3[15].val[0]); + output += 4; + vst1q_s32(output, s3[15].val[1]); + output += 4; + + vst1q_s32(output, s3[16].val[0]); + output += 4; + vst1q_s32(output, s3[16].val[1]); + output += 4; + vst1q_s32(output, s3[17].val[0]); + output += 4; + vst1q_s32(output, s3[17].val[1]); + output += 4; + vst1q_s32(output, s3[18].val[0]); + output += 4; + vst1q_s32(output, s3[18].val[1]); + output += 4; + vst1q_s32(output, s3[19].val[0]); + output += 4; + vst1q_s32(output, s3[19].val[1]); + output += 4; + vst1q_s32(output, s3[20].val[0]); + output += 4; + vst1q_s32(output, s3[20].val[1]); + output += 4; + vst1q_s32(output, s3[21].val[0]); + output += 4; + vst1q_s32(output, s3[21].val[1]); + output += 4; + vst1q_s32(output, s3[22].val[0]); + output += 4; + vst1q_s32(output, s3[22].val[1]); + output += 4; + vst1q_s32(output, s3[23].val[0]); + output += 4; + vst1q_s32(output, s3[23].val[1]); + output += 4; + + vst1q_s32(output, s3[24].val[0]); + output += 4; + vst1q_s32(output, s3[24].val[1]); + output += 4; + vst1q_s32(output, s3[25].val[0]); + output += 4; + vst1q_s32(output, s3[25].val[1]); + output += 4; + vst1q_s32(output, s3[26].val[0]); + output += 4; + vst1q_s32(output, s3[26].val[1]); + output += 4; + vst1q_s32(output, s3[27].val[0]); + output += 4; + vst1q_s32(output, s3[27].val[1]); + output += 4; + vst1q_s32(output, s3[28].val[0]); + output += 4; + vst1q_s32(output, s3[28].val[1]); + output += 4; + vst1q_s32(output, s3[29].val[0]); + output += 4; + vst1q_s32(output, s3[29].val[1]); + output += 4; + vst1q_s32(output, s3[30].val[0]); + output += 4; + vst1q_s32(output, s3[30].val[1]); + output += 4; + vst1q_s32(output, s3[31].val[0]); + output += 4; + vst1q_s32(output, s3[31].val[1]); +} + +static void vpx_highbd_idct32_8_neon(const int32_t *input, uint16_t *output, + int stride, const int bd) { + int32x4x2_t in[8], s1[32], s2[32], s3[32], out[32]; + + load_and_transpose_s32_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4], + &in[5], &in[6], &in[7]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); + + // Different for _8_ + s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); + + s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); + + s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64); + + // stage 3 + s1[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); + s1[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); + + s1[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64, + s1[31], cospi_28_64); + s1[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64, + s1[31], 
cospi_4_64); + + // Different for _8_ + s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s1[19], -cospi_28_64, + s1[28], -cospi_4_64); + s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s1[19], -cospi_4_64, + s1[28], cospi_28_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], -cospi_20_64, + s1[27], cospi_12_64); + s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], cospi_12_64, + s1[27], cospi_20_64); + + s1[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s1[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s1[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); + + s2[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64, + s2[15], cospi_24_64); + s2[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64, + s2[15], cospi_8_64); + + s2[10] = multiply_accumulate_shift_and_narrow_s32_dual(s2[11], -cospi_24_64, + s2[12], -cospi_8_64); + s2[13] = multiply_accumulate_shift_and_narrow_s32_dual(s2[11], -cospi_8_64, + s2[12], cospi_24_64); + + s2[16] = highbd_idct_add_dual(s1[16], s1[19]); + + s2[17] = highbd_idct_add_dual(s1[17], s1[18]); + s2[18] = highbd_idct_sub_dual(s1[17], s1[18]); + + s2[19] = highbd_idct_sub_dual(s1[16], s1[19]); + + s2[20] = highbd_idct_sub_dual(s1[23], s1[20]); + s2[21] = highbd_idct_sub_dual(s1[22], s1[21]); + + s2[22] = highbd_idct_add_dual(s1[21], s1[22]); + s2[23] = highbd_idct_add_dual(s1[20], s1[23]); + + s2[24] = highbd_idct_add_dual(s1[24], s1[27]); + s2[25] = highbd_idct_add_dual(s1[25], s1[26]); + s2[26] = highbd_idct_sub_dual(s1[25], s1[26]); + s2[27] = highbd_idct_sub_dual(s1[24], s1[27]); + + s2[28] = highbd_idct_sub_dual(s1[31], s1[28]); + s2[29] = highbd_idct_sub_dual(s1[30], s1[29]); + s2[30] = highbd_idct_add_dual(s1[29], s1[30]); + s2[31] = highbd_idct_add_dual(s1[28], s1[31]); + + // stage 5 + s1[5] = sub_multiply_shift_and_narrow_s32_dual(s1[7], s1[4], cospi_16_64); + s1[6] = add_multiply_shift_and_narrow_s32_dual(s1[4], s1[7], cospi_16_64); + + s1[8] = highbd_idct_add_dual(s2[8], s2[11]); + s1[9] = highbd_idct_add_dual(s2[9], s2[10]); + s1[10] = highbd_idct_sub_dual(s2[9], s2[10]); + s1[11] = highbd_idct_sub_dual(s2[8], s2[11]); + s1[12] = highbd_idct_sub_dual(s2[15], s2[12]); + s1[13] = highbd_idct_sub_dual(s2[14], s2[13]); + s1[14] = highbd_idct_add_dual(s2[13], s2[14]); + s1[15] = highbd_idct_add_dual(s2[12], s2[15]); + + s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_8_64, + s2[29], cospi_24_64); + s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], cospi_24_64, + s2[29], cospi_8_64); + + s1[19] = multiply_accumulate_shift_and_narrow_s32_dual(s2[19], -cospi_8_64, + s2[28], cospi_24_64); + s1[28] = multiply_accumulate_shift_and_narrow_s32_dual(s2[19], cospi_24_64, + s2[28], cospi_8_64); + + s1[20] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_24_64, + s2[27], -cospi_8_64); + s1[27] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_8_64, + s2[27], cospi_24_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_24_64, + s2[26], -cospi_8_64); + s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_8_64, + s2[26], cospi_24_64); + + // stage 6 + s2[0] = highbd_idct_add_dual(s1[0], s1[7]); + s2[1] = highbd_idct_add_dual(s1[0], s1[6]); + s2[2] = highbd_idct_add_dual(s1[0], s1[5]); + s2[3] = highbd_idct_add_dual(s1[0], s1[4]); + s2[4] = 
highbd_idct_sub_dual(s1[0], s1[4]); + s2[5] = highbd_idct_sub_dual(s1[0], s1[5]); + s2[6] = highbd_idct_sub_dual(s1[0], s1[6]); + s2[7] = highbd_idct_sub_dual(s1[0], s1[7]); + + s2[10] = sub_multiply_shift_and_narrow_s32_dual(s1[13], s1[10], cospi_16_64); + s2[13] = add_multiply_shift_and_narrow_s32_dual(s1[10], s1[13], cospi_16_64); + + s2[11] = sub_multiply_shift_and_narrow_s32_dual(s1[12], s1[11], cospi_16_64); + s2[12] = add_multiply_shift_and_narrow_s32_dual(s1[11], s1[12], cospi_16_64); + + s1[16] = highbd_idct_add_dual(s2[16], s2[23]); + s1[17] = highbd_idct_add_dual(s2[17], s2[22]); + s2[18] = highbd_idct_add_dual(s1[18], s1[21]); + s2[19] = highbd_idct_add_dual(s1[19], s1[20]); + s2[20] = highbd_idct_sub_dual(s1[19], s1[20]); + s2[21] = highbd_idct_sub_dual(s1[18], s1[21]); + s1[22] = highbd_idct_sub_dual(s2[17], s2[22]); + s1[23] = highbd_idct_sub_dual(s2[16], s2[23]); + + s3[24] = highbd_idct_sub_dual(s2[31], s2[24]); + s3[25] = highbd_idct_sub_dual(s2[30], s2[25]); + s3[26] = highbd_idct_sub_dual(s1[29], s1[26]); + s3[27] = highbd_idct_sub_dual(s1[28], s1[27]); + s2[28] = highbd_idct_add_dual(s1[27], s1[28]); + s2[29] = highbd_idct_add_dual(s1[26], s1[29]); + s2[30] = highbd_idct_add_dual(s2[25], s2[30]); + s2[31] = highbd_idct_add_dual(s2[24], s2[31]); + + // stage 7 + s1[0] = highbd_idct_add_dual(s2[0], s1[15]); + s1[1] = highbd_idct_add_dual(s2[1], s1[14]); + s1[2] = highbd_idct_add_dual(s2[2], s2[13]); + s1[3] = highbd_idct_add_dual(s2[3], s2[12]); + s1[4] = highbd_idct_add_dual(s2[4], s2[11]); + s1[5] = highbd_idct_add_dual(s2[5], s2[10]); + s1[6] = highbd_idct_add_dual(s2[6], s1[9]); + s1[7] = highbd_idct_add_dual(s2[7], s1[8]); + s1[8] = highbd_idct_sub_dual(s2[7], s1[8]); + s1[9] = highbd_idct_sub_dual(s2[6], s1[9]); + s1[10] = highbd_idct_sub_dual(s2[5], s2[10]); + s1[11] = highbd_idct_sub_dual(s2[4], s2[11]); + s1[12] = highbd_idct_sub_dual(s2[3], s2[12]); + s1[13] = highbd_idct_sub_dual(s2[2], s2[13]); + s1[14] = highbd_idct_sub_dual(s2[1], s1[14]); + s1[15] = highbd_idct_sub_dual(s2[0], s1[15]); + + s1[20] = sub_multiply_shift_and_narrow_s32_dual(s3[27], s2[20], cospi_16_64); + s1[27] = add_multiply_shift_and_narrow_s32_dual(s2[20], s3[27], cospi_16_64); + + s1[21] = sub_multiply_shift_and_narrow_s32_dual(s3[26], s2[21], cospi_16_64); + s1[26] = add_multiply_shift_and_narrow_s32_dual(s2[21], s3[26], cospi_16_64); + + s2[22] = sub_multiply_shift_and_narrow_s32_dual(s3[25], s1[22], cospi_16_64); + s1[25] = add_multiply_shift_and_narrow_s32_dual(s1[22], s3[25], cospi_16_64); + + s2[23] = sub_multiply_shift_and_narrow_s32_dual(s3[24], s1[23], cospi_16_64); + s1[24] = add_multiply_shift_and_narrow_s32_dual(s1[23], s3[24], cospi_16_64); + + // final stage + out[0] = highbd_idct_add_dual(s1[0], s2[31]); + out[1] = highbd_idct_add_dual(s1[1], s2[30]); + out[2] = highbd_idct_add_dual(s1[2], s2[29]); + out[3] = highbd_idct_add_dual(s1[3], s2[28]); + out[4] = highbd_idct_add_dual(s1[4], s1[27]); + out[5] = highbd_idct_add_dual(s1[5], s1[26]); + out[6] = highbd_idct_add_dual(s1[6], s1[25]); + out[7] = highbd_idct_add_dual(s1[7], s1[24]); + out[8] = highbd_idct_add_dual(s1[8], s2[23]); + out[9] = highbd_idct_add_dual(s1[9], s2[22]); + out[10] = highbd_idct_add_dual(s1[10], s1[21]); + out[11] = highbd_idct_add_dual(s1[11], s1[20]); + out[12] = highbd_idct_add_dual(s1[12], s2[19]); + out[13] = highbd_idct_add_dual(s1[13], s2[18]); + out[14] = highbd_idct_add_dual(s1[14], s1[17]); + out[15] = highbd_idct_add_dual(s1[15], s1[16]); + out[16] = highbd_idct_sub_dual(s1[15], s1[16]); + 
out[17] = highbd_idct_sub_dual(s1[14], s1[17]); + out[18] = highbd_idct_sub_dual(s1[13], s2[18]); + out[19] = highbd_idct_sub_dual(s1[12], s2[19]); + out[20] = highbd_idct_sub_dual(s1[11], s1[20]); + out[21] = highbd_idct_sub_dual(s1[10], s1[21]); + out[22] = highbd_idct_sub_dual(s1[9], s2[22]); + out[23] = highbd_idct_sub_dual(s1[8], s2[23]); + out[24] = highbd_idct_sub_dual(s1[7], s1[24]); + out[25] = highbd_idct_sub_dual(s1[6], s1[25]); + out[26] = highbd_idct_sub_dual(s1[5], s1[26]); + out[27] = highbd_idct_sub_dual(s1[4], s1[27]); + out[28] = highbd_idct_sub_dual(s1[3], s2[28]); + out[29] = highbd_idct_sub_dual(s1[2], s2[29]); + out[30] = highbd_idct_sub_dual(s1[1], s2[30]); + out[31] = highbd_idct_sub_dual(s1[0], s2[31]); + + highbd_idct16x16_add_store(out, output, stride, bd); + highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd); +} + +void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + + if (bd == 8) { + int16_t temp[32 * 8]; + int16_t *t = temp; + + vpx_idct32_6_neon(input, t); + + for (i = 0; i < 32; i += 8) { + vpx_idct32_8_neon(t, dest, stride, 1); + t += (8 * 8); + dest += 8; + } + } else { + int32_t temp[32 * 8]; + int32_t *t = temp; + + vpx_highbd_idct32_6_neon(input, t); + + for (i = 0; i < 32; i += 8) { + vpx_highbd_idct32_8_neon(t, dest, stride, bd); + t += (8 * 8); + dest += 8; + } + } +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c new file mode 100644 index 0000000000..c1354c0c1a --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x8_t a0 = vld1q_u16(*dest); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const uint16x8_t a2 = vld1q_u16(*dest + 16); + const uint16x8_t a3 = vld1q_u16(*dest + 24); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); + const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2)); + const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3)); + const int16x8_t c0 = vminq_s16(b0, max); + const int16x8_t c1 = vminq_s16(b1, max); + const int16x8_t c2 = vminq_s16(b2, max); + const int16x8_t c3 = vminq_s16(b3, max); + vst1q_u16(*dest, vreinterpretq_u16_s16(c0)); + vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1)); + vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2)); + vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3)); + *dest += stride; +} + +static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest, + const int stride, + const int16x8_t res) { + const uint16x8_t a0 = vld1q_u16(*dest); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const uint16x8_t a2 = vld1q_u16(*dest + 16); + const uint16x8_t a3 = vld1q_u16(*dest + 24); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); + const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2)); + const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3)); + const uint16x8_t c0 = vqshluq_n_s16(b0, 0); + const uint16x8_t c1 = vqshluq_n_s16(b1, 0); + const uint16x8_t c2 = vqshluq_n_s16(b2, 0); + const uint16x8_t c3 = vqshluq_n_s16(b3, 0); + vst1q_u16(*dest, c0); + vst1q_u16(*dest + 8, c1); + vst1q_u16(*dest + 16, c2); + vst1q_u16(*dest + 24, c3); + *dest += stride; +} + +void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); + const int16x8_t dc = vdupq_n_s16(a1); + int i; + + if (a1 >= 0) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + for (i = 0; i < 8; ++i) { + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + } + } else { + for (i = 0; i < 8; ++i) { + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + } + } +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c new file mode 100644 index 0000000000..1418a75a15 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x4_t a0 = vld1_u16(*dest); + const uint16x4_t a1 = vld1_u16(*dest + stride); + const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1)); + // Note: In some profile tests, res is quite close to +/-32767. + // We use saturating addition. + const int16x8_t b = vqaddq_s16(res, a); + const int16x8_t c = vminq_s16(b, max); + const uint16x8_t d = vqshluq_n_s16(c, 0); + vst1_u16(*dest, vget_low_u16(d)); + *dest += stride; + vst1_u16(*dest, vget_high_u16(d)); + *dest += stride; +} + +// res is in reverse row order +static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x4_t a0 = vld1_u16(*dest); + const uint16x4_t a1 = vld1_u16(*dest + stride); + const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a1, a0)); + // Note: In some profile tests, res is quite close to +/-32767. + // We use saturating addition. + const int16x8_t b = vqaddq_s16(res, a); + const int16x8_t c = vminq_s16(b, max); + const uint16x8_t d = vqshluq_n_s16(c, 0); + vst1_u16(*dest, vget_high_u16(d)); + *dest += stride; + vst1_u16(*dest, vget_low_u16(d)); + *dest += stride; +} + +void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); + const int16x8_t dc = vdupq_n_s16(a1); + + highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); + highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); +} + +static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, + int32x4_t *const a0, + int32x4_t *const a1, + int32x4_t *const a2, + int32x4_t *const a3) { + int32x4_t b0, b1, b2, b3; + + transpose_s32_4x4(a0, a1, a2, a3); + b0 = vaddq_s32(*a0, *a2); + b1 = vsubq_s32(*a0, *a2); + b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0); + b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0); + b2 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1); + b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1); + b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1); + b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1); + b0 = vrshrq_n_s32(b0, DCT_CONST_BITS); + b1 = vrshrq_n_s32(b1, DCT_CONST_BITS); + b2 = vrshrq_n_s32(b2, DCT_CONST_BITS); + b3 = vrshrq_n_s32(b3, DCT_CONST_BITS); + *a0 = vaddq_s32(b0, b3); + *a1 = vaddq_s32(b1, b2); + *a2 = vsubq_s32(b1, b2); + *a3 = vsubq_s32(b0, b3); +} + +static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, + int32x4_t *const a0, + int32x4_t *const a1, + int32x4_t *const a2, + int32x4_t *const a3) { + int32x4_t b0, b1, b2, b3; + int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11; + + transpose_s32_4x4(a0, a1, a2, a3); + b0 = vaddq_s32(*a0, *a2); + b1 = vsubq_s32(*a0, *a2); + c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); + c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); + 
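// 12-bit input needs 64-bit intermediate products (vmull), which are narrowed
+ // back to 32 bits by the DCT_CONST_BITS rounding shift below. +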
c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); + c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); + c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1); + c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1); + c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1); + c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1); + c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1); + c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1); + c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1); + c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1); + c4 = vsubq_s64(c4, c8); + c5 = vsubq_s64(c5, c9); + c6 = vaddq_s64(c6, c10); + c7 = vaddq_s64(c7, c11); + b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS), + vrshrn_n_s64(c1, DCT_CONST_BITS)); + b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS), + vrshrn_n_s64(c3, DCT_CONST_BITS)); + b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS), + vrshrn_n_s64(c5, DCT_CONST_BITS)); + b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS), + vrshrn_n_s64(c7, DCT_CONST_BITS)); + *a0 = vaddq_s32(b0, b3); + *a1 = vaddq_s32(b1, b2); + *a2 = vsubq_s32(b1, b2); + *a3 = vsubq_s32(b0, b3); +} + +void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int32x4_t c0 = vld1q_s32(input); + int32x4_t c1 = vld1q_s32(input + 4); + int32x4_t c2 = vld1q_s32(input + 8); + int32x4_t c3 = vld1q_s32(input + 12); + int16x8_t a0, a1; + + if (bd == 8) { + const int16x4_t cospis = vld1_s16(kCospi); + + // Rows + a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1)); + a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3)); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); + + // Columns + a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a0 = vrshrq_n_s16(a0, 4); + a1 = vrshrq_n_s16(a1, 4); + } else { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); + } else { + idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); + } + a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4)); + a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4)); + } + + highbd_idct4x4_1_add_kernel1(&dest, stride, a0, max); + highbd_idct4x4_1_add_kernel2(&dest, stride, a1, max); +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c new file mode 100644 index 0000000000..dd90134a6e --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c @@ -0,0 +1,631 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct8x8_1_add_pos_kernel(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x8_t a = vld1q_u16(*dest); + const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a)); + const int16x8_t c = vminq_s16(b, max); + vst1q_u16(*dest, vreinterpretq_u16_s16(c)); + *dest += stride; +} + +static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest, + const int stride, + const int16x8_t res) { + const uint16x8_t a = vld1q_u16(*dest); + const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a)); + const uint16x8_t c = vqshluq_n_s16(b, 0); + vst1q_u16(*dest, c); + *dest += stride; +} + +void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); + const int16x8_t dc = vdupq_n_s16(a1); + + if (a1 >= 0) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + } else { + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + } +} + +static INLINE void idct8x8_12_half1d_bd10( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x4_t step1[8], step2[8]; + + transpose_s32_4x4(io0, io1, io2, io3); + + // stage 1 + step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1); + step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0); + step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1); + step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0); + step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS); + + // stage 2 + step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); + step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1); + step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1); + step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS); + step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS); + step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = 
vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[1], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[1], step2[3]); + + step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); + step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_half1d_bd12( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x2_t input_1l, input_1h, input_3l, input_3h; + int32x2_t step1l[2], step1h[2]; + int32x4_t step1[8], step2[8]; + int64x2_t t64[8]; + int32x2_t t32[8]; + + transpose_s32_4x4(io0, io1, io2, io3); + + // stage 1 + input_1l = vget_low_s32(*io1); + input_1h = vget_high_s32(*io1); + input_3l = vget_low_s32(*io3); + input_3h = vget_high_s32(*io3); + step1l[0] = vget_low_s32(*io0); + step1h[0] = vget_high_s32(*io0); + step1l[1] = vget_low_s32(*io2); + step1h[1] = vget_high_s32(*io2); + + t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); + step1[4] = vcombine_s32(t32[0], t32[1]); + step1[5] = vcombine_s32(t32[2], t32[3]); + step1[6] = vcombine_s32(t32[4], t32[5]); + step1[7] = vcombine_s32(t32[6], t32[7]); + + // stage 2 + t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0); + t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1); + t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1); + t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1); + t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); + step2[1] = vcombine_s32(t32[2], t32[3]); + step2[2] = vcombine_s32(t32[4], t32[5]); + step2[3] = vcombine_s32(t32[6], 
t32[7]); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[1], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[1], step2[3]); + + t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[0] = + vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t64[2] = + vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + step1[5] = vcombine_s32(t32[0], t32[1]); + step1[6] = vcombine_s32(t32[2], t32[3]); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, + int16x8_t a3, int16x8_t a4, int16x8_t a5, + int16x8_t a6, int16x8_t a7, uint16_t *dest, + const int stride, const int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + const uint16_t *dst = dest; + uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7; + uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; + int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16; + + d0 = vld1q_u16(dst); + dst += stride; + d1 = vld1q_u16(dst); + dst += stride; + d2 = vld1q_u16(dst); + dst += stride; + d3 = vld1q_u16(dst); + dst += stride; + d4 = vld1q_u16(dst); + dst += stride; + d5 = vld1q_u16(dst); + dst += stride; + d6 = vld1q_u16(dst); + dst += stride; + d7 = vld1q_u16(dst); + + d0_s16 = vqaddq_s16(a0, vreinterpretq_s16_u16(d0)); + d1_s16 = vqaddq_s16(a1, vreinterpretq_s16_u16(d1)); + d2_s16 = vqaddq_s16(a2, vreinterpretq_s16_u16(d2)); + d3_s16 = vqaddq_s16(a3, vreinterpretq_s16_u16(d3)); + d4_s16 = vqaddq_s16(a4, vreinterpretq_s16_u16(d4)); + d5_s16 = vqaddq_s16(a5, vreinterpretq_s16_u16(d5)); + d6_s16 = vqaddq_s16(a6, vreinterpretq_s16_u16(d6)); + d7_s16 = vqaddq_s16(a7, vreinterpretq_s16_u16(d7)); + + d0_s16 = vminq_s16(d0_s16, max); + d1_s16 = vminq_s16(d1_s16, max); + d2_s16 = vminq_s16(d2_s16, max); + d3_s16 = vminq_s16(d3_s16, max); + d4_s16 = vminq_s16(d4_s16, max); + d5_s16 = vminq_s16(d5_s16, max); + d6_s16 = vminq_s16(d6_s16, max); + d7_s16 = vminq_s16(d7_s16, max); + d0_u16 = vqshluq_n_s16(d0_s16, 0); + d1_u16 = vqshluq_n_s16(d1_s16, 0); + d2_u16 = vqshluq_n_s16(d2_s16, 0); + d3_u16 = vqshluq_n_s16(d3_s16, 0); + d4_u16 = vqshluq_n_s16(d4_s16, 0); + d5_u16 = vqshluq_n_s16(d5_s16, 0); + d6_u16 = vqshluq_n_s16(d6_s16, 0); + d7_u16 = vqshluq_n_s16(d7_s16, 0); + + vst1q_u16(dest, d0_u16); + dest += stride; + vst1q_u16(dest, d1_u16); + dest += stride; + vst1q_u16(dest, d2_u16); + dest += stride; + vst1q_u16(dest, d3_u16); + dest += stride; + vst1q_u16(dest, d4_u16); + dest += stride; + 
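// Store the last three rows of the clamped 8x8 block. +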
vst1q_u16(dest, d5_u16); + dest += stride; + vst1q_u16(dest, d6_u16); + dest += stride; + vst1q_u16(dest, d7_u16); +} + +void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int32x4_t a0 = vld1q_s32(input); + int32x4_t a1 = vld1q_s32(input + 8); + int32x4_t a2 = vld1q_s32(input + 16); + int32x4_t a3 = vld1q_s32(input + 24); + int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + + if (bd == 8) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x8_t cospisd = vaddq_s16(cospis, cospis); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 + const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 + int16x4_t b0 = vmovn_s32(a0); + int16x4_t b1 = vmovn_s32(a1); + int16x4_t b2 = vmovn_s32(a2); + int16x4_t b3 = vmovn_s32(a3); + int16x4_t b4, b5, b6, b7; + + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &b0, &b1, &b2, &b3, &b4, + &b5, &b6, &b7); + idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b0, b1, b2, b3, b4, b5, + b6, b7, &c0, &c1, &c2, &c3, &c4, &c5, &c6, &c7); + c0 = vrshrq_n_s16(c0, 5); + c1 = vrshrq_n_s16(c1, 5); + c2 = vrshrq_n_s16(c2, 5); + c3 = vrshrq_n_s16(c3, 5); + c4 = vrshrq_n_s16(c4, 5); + c5 = vrshrq_n_s16(c5, 5); + c6 = vrshrq_n_s16(c6, 5); + c7 = vrshrq_n_s16(c7, 5); + } else { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + int32x4_t a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15; + + if (bd == 10) { + idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, + &a6, &a7); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9, + &a10, &a11); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13, + &a14, &a15); + } else { + idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, + &a6, &a7); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9, + &a10, &a11); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13, + &a14, &a15); + } + c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5)); + c1 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5)); + c2 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5)); + c3 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5)); + c4 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5)); + c5 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5)); + c6 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5)); + c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5)); + } + highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd); +} + +static INLINE void idct8x8_64_half1d_bd10( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x4_t step1[8], step2[8]; + + transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1); + step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0); + step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1); + step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0); + + step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0); + step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1); + step1[6] = 
vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0); + step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1); + + step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS); + + // stage 2 + step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); + step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1); + step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1); + + step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); + step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); + step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1); + step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1); + + step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS); + step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS); + step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS); + step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[0], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[0], step2[3]); + + step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); + step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void idct8x8_64_half1d_bd12( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x2_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h, + input_7l, input_7h; + int32x2_t step1l[4], step1h[4]; + int32x4_t step1[8], step2[8]; + int64x2_t t64[8]; + int32x2_t t32[8]; + + transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + input_1l = vget_low_s32(*io1); + input_1h = vget_high_s32(*io1); + input_3l = vget_low_s32(*io3); + input_3h = vget_high_s32(*io3); + input_5l = vget_low_s32(*io5); + input_5h = vget_high_s32(*io5); + input_7l = vget_low_s32(*io7); + input_7h = vget_high_s32(*io7); + step1l[0] = vget_low_s32(*io0); + step1h[0] = vget_high_s32(*io0); + step1l[1] = vget_low_s32(*io2); + step1h[1] = vget_high_s32(*io2); + step1l[2] = vget_low_s32(*io4); + step1h[2] = vget_high_s32(*io4); + step1l[3] = vget_low_s32(*io6); + step1h[3] = vget_high_s32(*io6); + + t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input_3h, 
vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); + t64[0] = vmlsl_lane_s32(t64[0], input_7l, vget_low_s32(cospis1), 0); + t64[1] = vmlsl_lane_s32(t64[1], input_7h, vget_low_s32(cospis1), 0); + t64[2] = vmlal_lane_s32(t64[2], input_5l, vget_low_s32(cospis1), 1); + t64[3] = vmlal_lane_s32(t64[3], input_5h, vget_low_s32(cospis1), 1); + t64[4] = vmlsl_lane_s32(t64[4], input_5l, vget_high_s32(cospis1), 0); + t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0); + t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1); + t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); + step1[4] = vcombine_s32(t32[0], t32[1]); + step1[5] = vcombine_s32(t32[2], t32[3]); + step1[6] = vcombine_s32(t32[4], t32[5]); + step1[7] = vcombine_s32(t32[6], t32[7]); + + // stage 2 + t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0); + t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1); + t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1); + t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1); + t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1); + t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); + t64[1] = vmlal_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); + t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); + t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); + t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1); + t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1); + t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1); + t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); + step2[0] = vcombine_s32(t32[0], t32[1]); + step2[1] = vcombine_s32(t32[2], t32[3]); + step2[2] = vcombine_s32(t32[4], t32[5]); + step2[3] = vcombine_s32(t32[6], t32[7]); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[0], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[0], step2[3]); + + t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[0] = + vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t64[2] = 
+ vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + step1[5] = vcombine_s32(t32[0], t32[1]); + step1[6] = vcombine_s32(t32[2], t32[3]); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int32x4_t a0 = vld1q_s32(input); + int32x4_t a1 = vld1q_s32(input + 4); + int32x4_t a2 = vld1q_s32(input + 8); + int32x4_t a3 = vld1q_s32(input + 12); + int32x4_t a4 = vld1q_s32(input + 16); + int32x4_t a5 = vld1q_s32(input + 20); + int32x4_t a6 = vld1q_s32(input + 24); + int32x4_t a7 = vld1q_s32(input + 28); + int32x4_t a8 = vld1q_s32(input + 32); + int32x4_t a9 = vld1q_s32(input + 36); + int32x4_t a10 = vld1q_s32(input + 40); + int32x4_t a11 = vld1q_s32(input + 44); + int32x4_t a12 = vld1q_s32(input + 48); + int32x4_t a13 = vld1q_s32(input + 52); + int32x4_t a14 = vld1q_s32(input + 56); + int32x4_t a15 = vld1q_s32(input + 60); + int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + + if (bd == 8) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + int16x8_t b0 = vcombine_s16(vmovn_s32(a0), vmovn_s32(a1)); + int16x8_t b1 = vcombine_s16(vmovn_s32(a2), vmovn_s32(a3)); + int16x8_t b2 = vcombine_s16(vmovn_s32(a4), vmovn_s32(a5)); + int16x8_t b3 = vcombine_s16(vmovn_s32(a6), vmovn_s32(a7)); + int16x8_t b4 = vcombine_s16(vmovn_s32(a8), vmovn_s32(a9)); + int16x8_t b5 = vcombine_s16(vmovn_s32(a10), vmovn_s32(a11)); + int16x8_t b6 = vcombine_s16(vmovn_s32(a12), vmovn_s32(a13)); + int16x8_t b7 = vcombine_s16(vmovn_s32(a14), vmovn_s32(a15)); + + idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); + idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); + + c0 = vrshrq_n_s16(b0, 5); + c1 = vrshrq_n_s16(b1, 5); + c2 = vrshrq_n_s16(b2, 5); + c3 = vrshrq_n_s16(b3, 5); + c4 = vrshrq_n_s16(b4, 5); + c5 = vrshrq_n_s16(b5, 5); + c6 = vrshrq_n_s16(b6, 5); + c7 = vrshrq_n_s16(b7, 5); + } else { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + if (bd == 10) { + idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, + &a6, &a7); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13, + &a14, &a15); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10, + &a3, &a11); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14, + &a7, &a15); + } else { + idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, + &a6, &a7); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13, + &a14, &a15); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10, + &a3, &a11); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14, + 
&a7, &a15); + } + c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5)); + c1 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5)); + c2 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5)); + c3 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5)); + c4 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5)); + c5 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5)); + c6 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5)); + c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5)); + } + highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd); +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c new file mode 100644 index 0000000000..6f7e5da762 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c @@ -0,0 +1,1078 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +//------------------------------------------------------------------------------ +// DC 4x4 + +static INLINE uint16x4_t dc_sum_4(const uint16_t *ref) { + const uint16x4_t ref_u16 = vld1_u16(ref); + const uint16x4_t p0 = vpadd_u16(ref_u16, ref_u16); + return vpadd_u16(p0, p0); +} + +static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, + const uint16x4_t dc) { + const uint16x4_t dc_dup = vdup_lane_u16(dc, 0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + vst1_u16(dst, dc_dup); + } +} + +void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t a = vld1_u16(above); + const uint16x4_t l = vld1_u16(left); + uint16x4_t sum; + uint16x4_t dc; + (void)bd; + sum = vadd_u16(a, l); + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vrshr_n_u16(sum, 3); + dc_store_4x4(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_4(left); + const uint16x4_t dc = vrshr_n_u16(sum, 2); + (void)above; + (void)bd; + dc_store_4x4(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_4(above); + const uint16x4_t dc = vrshr_n_u16(sum, 2); + (void)left; + (void)bd; + dc_store_4x4(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + dc_store_4x4(dst, stride, dc); +} + +//------------------------------------------------------------------------------ +// DC 8x8 + +static INLINE uint16x4_t dc_sum_8(const uint16_t *ref) { + const uint16x8_t ref_u16 = vld1q_u16(ref); + uint16x4_t sum = vadd_u16(vget_low_u16(ref_u16), vget_high_u16(ref_u16)); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} + +static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride, + const uint16x4_t dc) { + const uint16x8_t 
dc_dup = vdupq_lane_u16(dc, 0); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + vst1q_u16(dst, dc_dup); + } +} + +void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t above_u16 = vld1q_u16(above); + const uint16x8_t left_u16 = vld1q_u16(left); + const uint16x8_t p0 = vaddq_u16(above_u16, left_u16); + uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + uint16x4_t dc; + (void)bd; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vrshr_n_u16(sum, 4); + dc_store_8x8(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_8(left); + const uint16x4_t dc = vrshr_n_u16(sum, 3); + (void)above; + (void)bd; + dc_store_8x8(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_8(above); + const uint16x4_t dc = vrshr_n_u16(sum, 3); + (void)left; + (void)bd; + dc_store_8x8(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + dc_store_8x8(dst, stride, dc); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) { + const uint16x8x2_t ref_u16 = vld2q_u16(ref); + const uint16x8_t p0 = vaddq_u16(ref_u16.val[0], ref_u16.val[1]); + uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} + +static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, + const uint16x4_t dc) { + uint16x8x2_t dc_dup; + int i; + dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); + for (i = 0; i < 16; ++i, dst += stride) { + vst2q_u16(dst, dc_dup); + } +} + +void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8x2_t a = vld2q_u16(above); + const uint16x8x2_t l = vld2q_u16(left); + const uint16x8_t pa = vaddq_u16(a.val[0], a.val[1]); + const uint16x8_t pl = vaddq_u16(l.val[0], l.val[1]); + const uint16x8_t pal0 = vaddq_u16(pa, pl); + uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); + uint32x2_t sum; + uint16x4_t dc; + (void)bd; + pal1 = vpadd_u16(pal1, pal1); + sum = vpaddl_u16(pal1); + dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + dc_store_16x16(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_16(left); + const uint16x4_t dc = vrshr_n_u16(sum, 4); + (void)above; + (void)bd; + dc_store_16x16(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_16(above); + const uint16x4_t dc = vrshr_n_u16(sum, 4); + (void)left; + (void)bd; + dc_store_16x16(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + 
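// Fill the 16x16 block with the mid-scale DC value (1 << (bd - 1)). +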
dc_store_16x16(dst, stride, dc); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) { + const uint16x8x4_t r = vld4q_u16(ref); + const uint16x8_t p0 = vaddq_u16(r.val[0], r.val[1]); + const uint16x8_t p1 = vaddq_u16(r.val[2], r.val[3]); + const uint16x8_t p2 = vaddq_u16(p0, p1); + uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + sum = vpadd_u16(sum, sum); + return vpaddl_u16(sum); +} + +static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, + const uint16x4_t dc) { + uint16x8x2_t dc_dup; + int i; + dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); + + for (i = 0; i < 32; ++i) { + vst2q_u16(dst, dc_dup); + dst += 16; + vst2q_u16(dst, dc_dup); + dst += stride - 16; + } +} + +void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8x4_t a = vld4q_u16(above); + const uint16x8x4_t l = vld4q_u16(left); + const uint16x8_t pa0 = vaddq_u16(a.val[0], a.val[1]); + const uint16x8_t pa1 = vaddq_u16(a.val[2], a.val[3]); + const uint16x8_t pl0 = vaddq_u16(l.val[0], l.val[1]); + const uint16x8_t pl1 = vaddq_u16(l.val[2], l.val[3]); + const uint16x8_t pa = vaddq_u16(pa0, pa1); + const uint16x8_t pl = vaddq_u16(pl0, pl1); + const uint16x8_t pal0 = vaddq_u16(pa, pl); + const uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); + uint32x2_t sum = vpaddl_u16(pal1); + uint16x4_t dc; + (void)bd; + sum = vpadd_u32(sum, sum); + dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 6)); + dc_store_32x32(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint32x2_t sum = dc_sum_32(left); + const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + (void)above; + (void)bd; + dc_store_32x32(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint32x2_t sum = dc_sum_32(above); + const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + (void)left; + (void)bd; + dc_store_32x32(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + dc_store_32x32(dst, stride, dc); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t ABCDEFGH = vld1q_u16(above); + const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1); + const uint16x8_t CDEFGH00 = vld1q_u16(above + 2); + const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00); + const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0); + const uint16x4_t avg2_low = vget_low_u16(avg2); + const uint16x4_t avg2_high = vget_high_u16(avg2); + const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1); + const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2); + const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3); + (void)left; + (void)bd; + vst1_u16(dst, avg2_low); + dst += stride; + vst1_u16(dst, r1); + dst += stride; + vst1_u16(dst, r2); + dst += stride; + vst1_u16(dst, r3); + vst1q_lane_u16(dst + 3, ABCDEFGH, 7); +} + +static INLINE void 
d45_store_8(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t above_right, uint16x8_t *row) { + *row = vextq_u16(*row, above_right, 1); + vst1q_u16(*dst, *row); + *dst += stride; +} + +void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t A0 = vld1q_u16(above); + const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3); + const uint16x8_t A1 = vld1q_u16(above + 1); + const uint16x8_t A2 = vld1q_u16(above + 2); + const uint16x8_t avg1 = vhaddq_u16(A0, A2); + uint16x8_t row = vrhaddq_u16(avg1, A1); + (void)left; + (void)bd; + + vst1q_u16(dst, row); + dst += stride; + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + vst1q_u16(dst, above_right); +} + +static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t above_right, uint16x8_t *row_0, + uint16x8_t *row_1) { + *row_0 = vextq_u16(*row_0, *row_1, 1); + *row_1 = vextq_u16(*row_1, above_right, 1); + vst1q_u16(*dst, *row_0); + *dst += 8; + vst1q_u16(*dst, *row_1); + *dst += stride - 8; +} + +void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t A0_0 = vld1q_u16(above); + const uint16x8_t A0_1 = vld1q_u16(above + 8); + const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3); + const uint16x8_t A1_0 = vld1q_u16(above + 1); + const uint16x8_t A1_1 = vld1q_u16(above + 9); + const uint16x8_t A2_0 = vld1q_u16(above + 2); + const uint16x8_t A2_1 = vld1q_u16(above + 10); + const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); + const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); + uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); + uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); + (void)left; + (void)bd; + + vst1q_u16(dst, row_0); + vst1q_u16(dst + 8, row_1); + dst += stride; + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + vst1q_u16(dst, above_right); + vst1q_u16(dst + 8, above_right); +} + +void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t A0_0 = vld1q_u16(above); + const uint16x8_t A0_1 = vld1q_u16(above + 8); + const uint16x8_t A0_2 = vld1q_u16(above + 16); + const uint16x8_t A0_3 = vld1q_u16(above + 24); + const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3); + const uint16x8_t A1_0 = vld1q_u16(above + 1); + const uint16x8_t A1_1 = vld1q_u16(above + 9); + const uint16x8_t A1_2 = 
vld1q_u16(above + 17); + const uint16x8_t A1_3 = vld1q_u16(above + 25); + const uint16x8_t A2_0 = vld1q_u16(above + 2); + const uint16x8_t A2_1 = vld1q_u16(above + 10); + const uint16x8_t A2_2 = vld1q_u16(above + 18); + const uint16x8_t A2_3 = vld1q_u16(above + 26); + const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); + const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); + const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2); + const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3); + uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); + uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); + uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2); + uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3); + int i; + (void)left; + (void)bd; + + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); + dst += 8; + vst1q_u16(dst, row_2); + dst += 8; + vst1q_u16(dst, row_3); + dst += stride - 24; + + for (i = 0; i < 30; ++i) { + row_0 = vextq_u16(row_0, row_1, 1); + row_1 = vextq_u16(row_1, row_2, 1); + row_2 = vextq_u16(row_2, row_3, 1); + row_3 = vextq_u16(row_3, above_right, 1); + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); + dst += 8; + vst1q_u16(dst, row_2); + dst += 8; + vst1q_u16(dst, row_3); + dst += stride - 24; + } + + vst1q_u16(dst, above_right); + dst += 8; + vst1q_u16(dst, above_right); + dst += 8; + vst1q_u16(dst, above_right); + dst += 8; + vst1q_u16(dst, above_right); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t XA0123___ = vld1q_u16(above - 1); + const uint16x4_t L0123 = vld1_u16(left); + const uint16x4_t L3210 = vrev64_u16(L0123); + const uint16x8_t L____3210 = vcombine_u16(L0123, L3210); + const uint16x8_t L3210XA012 = vcombine_u16(L3210, vget_low_u16(XA0123___)); + const uint16x8_t L210XA0123 = vextq_u16(L____3210, XA0123___, 5); + const uint16x8_t L10XA0123_ = vextq_u16(L____3210, XA0123___, 6); + const uint16x8_t avg1 = vhaddq_u16(L3210XA012, L10XA0123_); + const uint16x8_t avg2 = vrhaddq_u16(avg1, L210XA0123); + const uint16x4_t row_0 = vget_low_u16(avg2); + const uint16x4_t row_1 = vget_high_u16(avg2); + const uint16x4_t r0 = vext_u16(row_0, row_1, 3); + const uint16x4_t r1 = vext_u16(row_0, row_1, 2); + const uint16x4_t r2 = vext_u16(row_0, row_1, 1); + (void)bd; + vst1_u16(dst, r0); + dst += stride; + vst1_u16(dst, r1); + dst += stride; + vst1_u16(dst, r2); + dst += stride; + vst1_u16(dst, row_0); +} + +void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t XA0123456 = vld1q_u16(above - 1); + const uint16x8_t A01234567 = vld1q_u16(above); + const uint16x8_t A1234567_ = vld1q_u16(above + 1); + const uint16x8_t L01234567 = vld1q_u16(left); + const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567)); + const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567)); + const uint16x8_t L76543210 = vcombine_u16(L7654, L3210); + const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1); + const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2); + const uint16x8_t avg_0 = vhaddq_u16(L76543210, L543210XA0); + const uint16x8_t avg_1 = vhaddq_u16(XA0123456, A1234567_); + const uint16x8_t row_0 = vrhaddq_u16(avg_0, L6543210X); + const uint16x8_t row_1 = vrhaddq_u16(avg_1, A01234567); + const uint16x8_t r0 = vextq_u16(row_0, row_1, 7); + const uint16x8_t r1 = vextq_u16(row_0, row_1, 6); + const uint16x8_t r2 = 
vextq_u16(row_0, row_1, 5); + const uint16x8_t r3 = vextq_u16(row_0, row_1, 4); + const uint16x8_t r4 = vextq_u16(row_0, row_1, 3); + const uint16x8_t r5 = vextq_u16(row_0, row_1, 2); + const uint16x8_t r6 = vextq_u16(row_0, row_1, 1); + (void)bd; + vst1q_u16(dst, r0); + dst += stride; + vst1q_u16(dst, r1); + dst += stride; + vst1q_u16(dst, r2); + dst += stride; + vst1q_u16(dst, r3); + dst += stride; + vst1q_u16(dst, r4); + dst += stride; + vst1q_u16(dst, r5); + dst += stride; + vst1q_u16(dst, r6); + dst += stride; + vst1q_u16(dst, row_0); +} + +static INLINE void d135_store_16(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t row_0, + const uint16x8_t row_1) { + vst1q_u16(*dst, row_0); + *dst += 8; + vst1q_u16(*dst, row_1); + *dst += stride - 8; +} + +void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t L01234567 = vld1q_u16(left); + const uint16x8_t L89abcdef = vld1q_u16(left + 8); + const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567)); + const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567)); + const uint16x4_t Lba98 = vrev64_u16(vget_low_u16(L89abcdef)); + const uint16x4_t Lfedc = vrev64_u16(vget_high_u16(L89abcdef)); + const uint16x8_t L76543210 = vcombine_u16(L7654, L3210); + const uint16x8_t Lfedcba98 = vcombine_u16(Lfedc, Lba98); + const uint16x8_t Ledcba987 = vextq_u16(Lfedcba98, L76543210, 1); + const uint16x8_t Ldcba9876 = vextq_u16(Lfedcba98, L76543210, 2); + const uint16x8_t avg_0 = vhaddq_u16(Lfedcba98, Ldcba9876); + const uint16x8_t row_0 = vrhaddq_u16(avg_0, Ledcba987); + + const uint16x8_t XA0123456 = vld1q_u16(above - 1); + const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1); + const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2); + const uint16x8_t avg_1 = vhaddq_u16(L76543210, L543210XA0); + const uint16x8_t row_1 = vrhaddq_u16(avg_1, L6543210X); + + const uint16x8_t A01234567 = vld1q_u16(above); + const uint16x8_t A12345678 = vld1q_u16(above + 1); + const uint16x8_t avg_2 = vhaddq_u16(XA0123456, A12345678); + const uint16x8_t row_2 = vrhaddq_u16(avg_2, A01234567); + + const uint16x8_t A789abcde = vld1q_u16(above + 7); + const uint16x8_t A89abcdef = vld1q_u16(above + 8); + const uint16x8_t A9abcdef_ = vld1q_u16(above + 9); + const uint16x8_t avg_3 = vhaddq_u16(A789abcde, A9abcdef_); + const uint16x8_t row_3 = vrhaddq_u16(avg_3, A89abcdef); + + const uint16x8_t r0_0 = vextq_u16(row_1, row_2, 7); + const uint16x8_t r0_1 = vextq_u16(row_2, row_3, 7); + const uint16x8_t r1_0 = vextq_u16(row_1, row_2, 6); + const uint16x8_t r1_1 = vextq_u16(row_2, row_3, 6); + const uint16x8_t r2_0 = vextq_u16(row_1, row_2, 5); + const uint16x8_t r2_1 = vextq_u16(row_2, row_3, 5); + const uint16x8_t r3_0 = vextq_u16(row_1, row_2, 4); + const uint16x8_t r3_1 = vextq_u16(row_2, row_3, 4); + const uint16x8_t r4_0 = vextq_u16(row_1, row_2, 3); + const uint16x8_t r4_1 = vextq_u16(row_2, row_3, 3); + const uint16x8_t r5_0 = vextq_u16(row_1, row_2, 2); + const uint16x8_t r5_1 = vextq_u16(row_2, row_3, 2); + const uint16x8_t r6_0 = vextq_u16(row_1, row_2, 1); + const uint16x8_t r6_1 = vextq_u16(row_2, row_3, 1); + const uint16x8_t r8_0 = vextq_u16(row_0, row_1, 7); + const uint16x8_t r9_0 = vextq_u16(row_0, row_1, 6); + const uint16x8_t ra_0 = vextq_u16(row_0, row_1, 5); + const uint16x8_t rb_0 = vextq_u16(row_0, row_1, 4); + const uint16x8_t rc_0 = vextq_u16(row_0, row_1, 3); + const uint16x8_t rd_0 = vextq_u16(row_0, row_1, 2); + const uint16x8_t re_0 
= vextq_u16(row_0, row_1, 1); + (void)bd; + + d135_store_16(&dst, stride, r0_0, r0_1); + d135_store_16(&dst, stride, r1_0, r1_1); + d135_store_16(&dst, stride, r2_0, r2_1); + d135_store_16(&dst, stride, r3_0, r3_1); + d135_store_16(&dst, stride, r4_0, r4_1); + d135_store_16(&dst, stride, r5_0, r5_1); + d135_store_16(&dst, stride, r6_0, r6_1); + d135_store_16(&dst, stride, row_1, row_2); + d135_store_16(&dst, stride, r8_0, r0_0); + d135_store_16(&dst, stride, r9_0, r1_0); + d135_store_16(&dst, stride, ra_0, r2_0); + d135_store_16(&dst, stride, rb_0, r3_0); + d135_store_16(&dst, stride, rc_0, r4_0); + d135_store_16(&dst, stride, rd_0, r5_0); + d135_store_16(&dst, stride, re_0, r6_0); + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); +} + +void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t LL01234567 = vld1q_u16(left + 16); + const uint16x8_t LL89abcdef = vld1q_u16(left + 24); + const uint16x4_t LL3210 = vrev64_u16(vget_low_u16(LL01234567)); + const uint16x4_t LL7654 = vrev64_u16(vget_high_u16(LL01234567)); + const uint16x4_t LLba98 = vrev64_u16(vget_low_u16(LL89abcdef)); + const uint16x4_t LLfedc = vrev64_u16(vget_high_u16(LL89abcdef)); + const uint16x8_t LL76543210 = vcombine_u16(LL7654, LL3210); + const uint16x8_t LLfedcba98 = vcombine_u16(LLfedc, LLba98); + const uint16x8_t LLedcba987 = vextq_u16(LLfedcba98, LL76543210, 1); + const uint16x8_t LLdcba9876 = vextq_u16(LLfedcba98, LL76543210, 2); + const uint16x8_t avg_0 = vhaddq_u16(LLfedcba98, LLdcba9876); + uint16x8_t row_0 = vrhaddq_u16(avg_0, LLedcba987); + + const uint16x8_t LU01234567 = vld1q_u16(left); + const uint16x8_t LU89abcdef = vld1q_u16(left + 8); + const uint16x4_t LU3210 = vrev64_u16(vget_low_u16(LU01234567)); + const uint16x4_t LU7654 = vrev64_u16(vget_high_u16(LU01234567)); + const uint16x4_t LUba98 = vrev64_u16(vget_low_u16(LU89abcdef)); + const uint16x4_t LUfedc = vrev64_u16(vget_high_u16(LU89abcdef)); + const uint16x8_t LU76543210 = vcombine_u16(LU7654, LU3210); + const uint16x8_t LUfedcba98 = vcombine_u16(LUfedc, LUba98); + const uint16x8_t LL6543210Uf = vextq_u16(LL76543210, LUfedcba98, 1); + const uint16x8_t LL543210Ufe = vextq_u16(LL76543210, LUfedcba98, 2); + const uint16x8_t avg_1 = vhaddq_u16(LL76543210, LL543210Ufe); + uint16x8_t row_1 = vrhaddq_u16(avg_1, LL6543210Uf); + + const uint16x8_t LUedcba987 = vextq_u16(LUfedcba98, LU76543210, 1); + const uint16x8_t LUdcba9876 = vextq_u16(LUfedcba98, LU76543210, 2); + const uint16x8_t avg_2 = vhaddq_u16(LUfedcba98, LUdcba9876); + uint16x8_t row_2 = vrhaddq_u16(avg_2, LUedcba987); + + const uint16x8_t XAL0123456 = vld1q_u16(above - 1); + const uint16x8_t LU6543210X = vextq_u16(LU76543210, XAL0123456, 1); + const uint16x8_t LU543210XA0 = vextq_u16(LU76543210, XAL0123456, 2); + const uint16x8_t avg_3 = vhaddq_u16(LU76543210, LU543210XA0); + uint16x8_t row_3 = vrhaddq_u16(avg_3, LU6543210X); + + const uint16x8_t AL01234567 = vld1q_u16(above); + const uint16x8_t AL12345678 = vld1q_u16(above + 1); + const uint16x8_t avg_4 = vhaddq_u16(XAL0123456, AL12345678); + uint16x8_t row_4 = vrhaddq_u16(avg_4, AL01234567); + + const uint16x8_t AL789abcde = vld1q_u16(above + 7); + const uint16x8_t AL89abcdef = vld1q_u16(above + 8); + const uint16x8_t AL9abcdefg = vld1q_u16(above + 9); + const uint16x8_t avg_5 = vhaddq_u16(AL789abcde, AL9abcdefg); + uint16x8_t row_5 = vrhaddq_u16(avg_5, AL89abcdef); + + const uint16x8_t ALfR0123456 = vld1q_u16(above + 15); + const uint16x8_t 
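The d135 predictors above derive every output sample from the 3-tap rounded average (a + 2*b + c + 2) >> 2 of neighboring reference pixels, and then reuse the filtered vectors for successive rows through vextq_u16 shifts. The vhaddq_u16/vrhaddq_u16 pair keeps that average inside 16 bits; the small harness below (illustrative only, the helper name avg3 is not part of the patch) checks that the two forms agree lane by lane.

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

/* Scalar form of the 3-tap smoothing used by the d135 predictors. */
static uint16_t avg3(uint16_t a, uint16_t b, uint16_t c) {
  return (uint16_t)((a + 2 * b + c + 2) >> 2);
}

int main(void) {
  const uint16_t a[8] = { 0, 1, 100, 4095, 4094, 7, 2048, 9 };
  const uint16_t b[8] = { 1, 2, 101, 4095, 4093, 8, 2047, 10 };
  const uint16_t c[8] = { 2, 3, 102, 4094, 4092, 9, 2046, 11 };
  /* (a + c) >> 1 without overflow, then a rounding half-add of b:        */
  /* (((a + c) >> 1) + b + 1) >> 1 equals (a + 2*b + c + 2) >> 2 exactly. */
  const uint16x8_t avg = vhaddq_u16(vld1q_u16(a), vld1q_u16(c));
  const uint16x8_t row = vrhaddq_u16(avg, vld1q_u16(b));
  uint16_t out[8];
  int i;
  vst1q_u16(out, row);
  for (i = 0; i < 8; ++i) assert(out[i] == avg3(a[i], b[i], c[i]));
  return 0;
}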
AR01234567 = vld1q_u16(above + 16); + const uint16x8_t AR12345678 = vld1q_u16(above + 17); + const uint16x8_t avg_6 = vhaddq_u16(ALfR0123456, AR12345678); + uint16x8_t row_6 = vrhaddq_u16(avg_6, AR01234567); + + const uint16x8_t AR789abcde = vld1q_u16(above + 23); + const uint16x8_t AR89abcdef = vld1q_u16(above + 24); + const uint16x8_t AR9abcdef_ = vld1q_u16(above + 25); + const uint16x8_t avg_7 = vhaddq_u16(AR789abcde, AR9abcdef_); + uint16x8_t row_7 = vrhaddq_u16(avg_7, AR89abcdef); + int i, j; + (void)bd; + + dst += 31 * stride; + for (i = 0; i < 4; ++i) { + for (j = 0; j < 8; ++j) { + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); + dst += 8; + vst1q_u16(dst, row_2); + dst += 8; + vst1q_u16(dst, row_3); + dst -= stride + 24; + row_0 = vextq_u16(row_0, row_1, 1); + row_1 = vextq_u16(row_1, row_2, 1); + row_2 = vextq_u16(row_2, row_3, 1); + row_3 = vextq_u16(row_3, row_4, 1); + row_4 = vextq_u16(row_4, row_4, 1); + } + row_4 = row_5; + row_5 = row_6; + row_6 = row_7; + } +} + +//------------------------------------------------------------------------------ + +void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t row = vld1_u16(above); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 4; i++, dst += stride) { + vst1_u16(dst, row); + } +} + +void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t row = vld1q_u16(above); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 8; i++, dst += stride) { + vst1q_u16(dst, row); + } +} + +void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8x2_t row = vld2q_u16(above); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 16; i++, dst += stride) { + vst2q_u16(dst, row); + } +} + +void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8x2_t row0 = vld2q_u16(above); + const uint16x8x2_t row1 = vld2q_u16(above + 16); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 32; i++) { + vst2q_u16(dst, row0); + dst += 16; + vst2q_u16(dst, row1); + dst += stride - 16; + } +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t left_u16 = vld1_u16(left); + uint16x4_t row; + (void)above; + (void)bd; + + row = vdup_lane_u16(left_u16, 0); + vst1_u16(dst, row); + dst += stride; + row = vdup_lane_u16(left_u16, 1); + vst1_u16(dst, row); + dst += stride; + row = vdup_lane_u16(left_u16, 2); + vst1_u16(dst, row); + dst += stride; + row = vdup_lane_u16(left_u16, 3); + vst1_u16(dst, row); +} + +void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t left_u16 = vld1q_u16(left); + const uint16x4_t left_low = vget_low_u16(left_u16); + const uint16x4_t left_high = vget_high_u16(left_u16); + uint16x8_t row; + (void)above; + (void)bd; + + row = vdupq_lane_u16(left_low, 0); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_low, 1); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_low, 2); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_low, 3); + vst1q_u16(dst, 
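The V and H predictors above are pure data movement: V replicates the row of `above` samples down the block, and H broadcasts each `left` sample across its row (vdup_lane/vdupq_lane in the NEON versions, with vld2q/vst2q pairs for the 16- and 32-wide cases). A scalar reference for an N x N high-bitdepth block, with the helper names invented here for illustration:

#include <stddef.h>
#include <stdint.h>

static void v_predictor(uint16_t *dst, ptrdiff_t stride, int N,
                        const uint16_t *above) {
  int r, c;
  for (r = 0; r < N; ++r, dst += stride) {
    for (c = 0; c < N; ++c) dst[c] = above[c]; /* copy the top row down */
  }
}

static void h_predictor(uint16_t *dst, ptrdiff_t stride, int N,
                        const uint16_t *left) {
  int r, c;
  for (r = 0; r < N; ++r, dst += stride) {
    for (c = 0; c < N; ++c) dst[c] = left[r]; /* broadcast the left pixel */
  }
}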
row); + dst += stride; + row = vdupq_lane_u16(left_high, 0); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 1); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 2); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 3); + vst1q_u16(dst, row); +} + +static INLINE void h_store_16(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t row) { + // Note: vst1q is faster than vst2q + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += stride - 8; +} + +void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + const uint16x8_t left_u16q = vld1q_u16(left); + const uint16x4_t left_low = vget_low_u16(left_u16q); + const uint16x4_t left_high = vget_high_u16(left_u16q); + uint16x8_t row; + + row = vdupq_lane_u16(left_low, 0); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_low, 1); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_low, 2); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_low, 3); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 0); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 1); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 2); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 3); + h_store_16(&dst, stride, row); + } +} + +static INLINE void h_store_32(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t row) { + // Note: vst1q is faster than vst2q + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += stride - 24; +} + +void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + const uint16x8_t left_u16q = vld1q_u16(left); + const uint16x4_t left_low = vget_low_u16(left_u16q); + const uint16x4_t left_high = vget_high_u16(left_u16q); + uint16x8_t row; + + row = vdupq_lane_u16(left_low, 0); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_low, 1); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_low, 2); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_low, 3); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 0); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 1); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 2); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 3); + h_store_32(&dst, stride, row); + } +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x4_t above_s16d = vld1_s16((const int16_t *)above); + const int16x8_t above_s16 = vcombine_s16(above_s16d, above_s16d); + const int16x4_t left_s16 = vld1_s16((const int16_t *)left); + const int16x8_t sub = vsubq_s16(above_s16, top_left); + int16x8_t sum; + uint16x8_t row; + + sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1)); + sum = vaddq_s16(sum, sub); + sum = vminq_s16(sum, max); + row = 
vqshluq_n_s16(sum, 0); + vst1_u16(dst, vget_low_u16(row)); + dst += stride; + vst1_u16(dst, vget_high_u16(row)); + dst += stride; + + sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3)); + sum = vaddq_s16(sum, sub); + sum = vminq_s16(sum, max); + row = vqshluq_n_s16(sum, 0); + vst1_u16(dst, vget_low_u16(row)); + dst += stride; + vst1_u16(dst, vget_high_u16(row)); +} + +static INLINE void tm_8_kernel(uint16_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub, + const int16x8_t max) { + uint16x8_t row; + int16x8_t sum = vaddq_s16(left_dup, sub); + sum = vminq_s16(sum, max); + row = vqshluq_n_s16(sum, 0); + vst1q_u16(*dst, row); + *dst += stride; +} + +void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x8_t above_s16 = vld1q_s16((const int16_t *)above); + const int16x8_t left_s16 = vld1q_s16((const int16_t *)left); + const int16x8_t sub = vsubq_s16(above_s16, top_left); + int16x4_t left_s16d; + int16x8_t left_dup; + int i; + + left_s16d = vget_low_s16(left_s16); + + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_8_kernel(&dst, stride, left_dup, sub, max); + + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_8_kernel(&dst, stride, left_dup, sub, max); + + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_8_kernel(&dst, stride, left_dup, sub, max); + + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_8_kernel(&dst, stride, left_dup, sub, max); + } +} + +static INLINE void tm_16_kernel(uint16_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1, const int16x8_t max) { + uint16x8_t row0, row1; + int16x8_t sum0 = vaddq_s16(left_dup, sub0); + int16x8_t sum1 = vaddq_s16(left_dup, sub1); + sum0 = vminq_s16(sum0, max); + sum1 = vminq_s16(sum1, max); + row0 = vqshluq_n_s16(sum0, 0); + row1 = vqshluq_n_s16(sum1, 0); + vst1q_u16(*dst, row0); + *dst += 8; + vst1q_u16(*dst, row1); + *dst += stride - 8; +} + +void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x8_t above0 = vld1q_s16((const int16_t *)above); + const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8)); + const int16x8_t sub0 = vsubq_s16(above0, top_left); + const int16x8_t sub1 = vsubq_s16(above1, top_left); + int16x8_t left_dup; + int i, j; + + for (j = 0; j < 2; j++, left += 8) { + const int16x8_t left_s16q = vld1q_s16((const int16_t *)left); + int16x4_t left_s16d = vget_low_s16(left_s16q); + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + } + } +} + +static INLINE void tm_32_kernel(uint16_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1, const int16x8_t sub2, + const 
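The TM (true-motion) predictors compute pred[r][c] = left[r] + above[c] - above[-1], clipped to [0, (1 << bd) - 1]. The NEON code forms sub = above - top_left once, adds a broadcast left[r] per row, clamps the high side with vminq_s16, and lets vqshluq_n_s16(sum, 0) saturate negative sums to zero while converting to unsigned. A scalar sketch of the same formula (clamp_px/tm_predictor are illustrative names only):

#include <stddef.h>
#include <stdint.h>

static uint16_t clamp_px(int v, int bd) {
  const int max = (1 << bd) - 1;
  return (uint16_t)(v < 0 ? 0 : (v > max ? max : v));
}

static void tm_predictor(uint16_t *dst, ptrdiff_t stride, int N,
                         const uint16_t *above, const uint16_t *left, int bd) {
  const int top_left = above[-1]; /* pixel above-left of the block */
  int r, c;
  for (r = 0; r < N; ++r, dst += stride) {
    for (c = 0; c < N; ++c) {
      dst[c] = clamp_px(left[r] + above[c] - top_left, bd);
    }
  }
}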
int16x8_t sub3, const int16x8_t max) { + uint16x8_t row0, row1, row2, row3; + int16x8_t sum0 = vaddq_s16(left_dup, sub0); + int16x8_t sum1 = vaddq_s16(left_dup, sub1); + int16x8_t sum2 = vaddq_s16(left_dup, sub2); + int16x8_t sum3 = vaddq_s16(left_dup, sub3); + sum0 = vminq_s16(sum0, max); + sum1 = vminq_s16(sum1, max); + sum2 = vminq_s16(sum2, max); + sum3 = vminq_s16(sum3, max); + row0 = vqshluq_n_s16(sum0, 0); + row1 = vqshluq_n_s16(sum1, 0); + row2 = vqshluq_n_s16(sum2, 0); + row3 = vqshluq_n_s16(sum3, 0); + vst1q_u16(*dst, row0); + *dst += 8; + vst1q_u16(*dst, row1); + *dst += 8; + vst1q_u16(*dst, row2); + *dst += 8; + vst1q_u16(*dst, row3); + *dst += stride - 24; +} + +void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x8_t above0 = vld1q_s16((const int16_t *)above); + const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8)); + const int16x8_t above2 = vld1q_s16((const int16_t *)(above + 16)); + const int16x8_t above3 = vld1q_s16((const int16_t *)(above + 24)); + const int16x8_t sub0 = vsubq_s16(above0, top_left); + const int16x8_t sub1 = vsubq_s16(above1, top_left); + const int16x8_t sub2 = vsubq_s16(above2, top_left); + const int16x8_t sub3 = vsubq_s16(above3, top_left); + int16x8_t left_dup; + int i, j; + + for (i = 0; i < 4; i++, left += 8) { + const int16x8_t left_s16q = vld1q_s16((const int16_t *)left); + int16x4_t left_s16d = vget_low_s16(left_s16q); + for (j = 0; j < 2; j++, left_s16d = vget_high_s16(left_s16q)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + } + } +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c new file mode 100644 index 0000000000..5530c6425b --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c @@ -0,0 +1,761 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/transpose_neon.h" + +static INLINE void load_thresh(const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, uint16x8_t *blimit_vec, + uint16x8_t *limit_vec, uint16x8_t *thresh_vec, + const int bd) { + const int16x8_t shift = vdupq_n_s16(bd - 8); + *blimit_vec = vmovl_u8(vld1_dup_u8(blimit)); + *limit_vec = vmovl_u8(vld1_dup_u8(limit)); + *thresh_vec = vmovl_u8(vld1_dup_u8(thresh)); + *blimit_vec = vshlq_u16(*blimit_vec, shift); + *limit_vec = vshlq_u16(*limit_vec, shift); + *thresh_vec = vshlq_u16(*thresh_vec, shift); +} + +// Here flat is 128-bit long, with each 16-bit chunk being a mask of +// a pixel. 
When used to control filter branches, we only detect whether it is +// all 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status. +// flat equals 0 if and only if flat_status equals 0. +// flat equals -1 (all 1s) if and only if flat_status equals -4. (This is true +// because each mask occupies more than 1 bit.) +static INLINE uint32_t calc_flat_status(const uint16x8_t flat) { + const uint64x1_t t0 = vadd_u64(vreinterpret_u64_u16(vget_low_u16(flat)), + vreinterpret_u64_u16(vget_high_u16(flat))); + const uint64x1_t t1 = vpaddl_u32(vreinterpret_u32_u64(t0)); + return vget_lane_u32(vreinterpret_u32_u64(t1), 0); +} + +static INLINE uint16x8_t +filter_hev_mask4(const uint16x8_t limit, const uint16x8_t blimit, + const uint16x8_t thresh, const uint16x8_t p3, + const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2, + const uint16x8_t q3, uint16x8_t *hev, uint16x8_t *mask) { + uint16x8_t max, t0, t1; + + max = vabdq_u16(p1, p0); + max = vmaxq_u16(max, vabdq_u16(q1, q0)); + *hev = vcgtq_u16(max, thresh); + *mask = vmaxq_u16(max, vabdq_u16(p3, p2)); + *mask = vmaxq_u16(*mask, vabdq_u16(p2, p1)); + *mask = vmaxq_u16(*mask, vabdq_u16(q2, q1)); + *mask = vmaxq_u16(*mask, vabdq_u16(q3, q2)); + t0 = vabdq_u16(p0, q0); + t1 = vabdq_u16(p1, q1); + t0 = vaddq_u16(t0, t0); + t1 = vshrq_n_u16(t1, 1); + t0 = vaddq_u16(t0, t1); + *mask = vcleq_u16(*mask, limit); + t0 = vcleq_u16(t0, blimit); + *mask = vandq_u16(*mask, t0); + + return max; +} + +static INLINE uint16x8_t filter_flat_hev_mask( + const uint16x8_t limit, const uint16x8_t blimit, const uint16x8_t thresh, + const uint16x8_t p3, const uint16x8_t p2, const uint16x8_t p1, + const uint16x8_t p0, const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, uint16x8_t *flat, + uint32_t *flat_status, uint16x8_t *hev, const int bd) { + uint16x8_t mask; + const uint16x8_t max = filter_hev_mask4(limit, blimit, thresh, p3, p2, p1, p0, + q0, q1, q2, q3, hev, &mask); + *flat = vmaxq_u16(max, vabdq_u16(p2, p0)); + *flat = vmaxq_u16(*flat, vabdq_u16(q2, q0)); + *flat = vmaxq_u16(*flat, vabdq_u16(p3, p0)); + *flat = vmaxq_u16(*flat, vabdq_u16(q3, q0)); + *flat = vcleq_u16(*flat, vdupq_n_u16(1 << (bd - 8))); /* flat_mask4() */ + *flat = vandq_u16(*flat, mask); + *flat_status = calc_flat_status(*flat); + + return mask; +} + +static INLINE uint16x8_t flat_mask5(const uint16x8_t p4, const uint16x8_t p3, + const uint16x8_t p2, const uint16x8_t p1, + const uint16x8_t p0, const uint16x8_t q0, + const uint16x8_t q1, const uint16x8_t q2, + const uint16x8_t q3, const uint16x8_t q4, + const uint16x8_t flat, + uint32_t *flat2_status, const int bd) { + uint16x8_t flat2 = vabdq_u16(p4, p0); + flat2 = vmaxq_u16(flat2, vabdq_u16(p3, p0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(p2, p0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(p1, p0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(q1, q0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(q2, q0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(q3, q0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(q4, q0)); + flat2 = vcleq_u16(flat2, vdupq_n_u16(1 << (bd - 8))); + flat2 = vandq_u16(flat2, flat); + *flat2_status = calc_flat_status(flat2); + + return flat2; +} + +static INLINE int16x8_t flip_sign(const uint16x8_t v, const int bd) { + const uint16x8_t offset = vdupq_n_u16(0x80 << (bd - 8)); + return vreinterpretq_s16_u16(vsubq_u16(v, offset)); +} + +static INLINE uint16x8_t flip_sign_back(const int16x8_t v, const int bd) { + const int16x8_t offset = vdupq_n_s16(0x80 
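filter_hev_mask4() and filter_flat_hev_mask() above turn per-pixel threshold tests into 0x0000/0xffff lane masks so the later steps can blend with vbslq_u16 instead of branching. Written out for a single pixel position (a scalar restatement for illustration, not part of the patch), the tests are:

#include <stdint.h>
#include <stdlib.h>

/* filter mask: every local difference within `limit`, and the edge
 * difference |p0-q0|*2 + |p1-q1|/2 within `blimit`. */
static int filter_mask(int limit, int blimit, int p3, int p2, int p1, int p0,
                       int q0, int q1, int q2, int q3) {
  int m = abs(p1 - p0);
  if (abs(q1 - q0) > m) m = abs(q1 - q0);
  if (abs(p3 - p2) > m) m = abs(p3 - p2);
  if (abs(p2 - p1) > m) m = abs(p2 - p1);
  if (abs(q2 - q1) > m) m = abs(q2 - q1);
  if (abs(q3 - q2) > m) m = abs(q3 - q2);
  return m <= limit && abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
}

/* hev: "high edge variance" selects the stronger inner-tap adjustment. */
static int hev_mask(int thresh, int p1, int p0, int q0, int q1) {
  return abs(p1 - p0) > thresh || abs(q1 - q0) > thresh;
}

The flat and flat2 masks follow the same pattern with the wider |p_n - p0| and |q_n - q0| differences compared against 1 << (bd - 8), and load_thresh() pre-shifts the 8-bit blimit/limit/thresh values by (bd - 8) so every comparison happens at full bit depth.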
<< (bd - 8)); + return vreinterpretq_u16_s16(vaddq_s16(v, offset)); +} + +static INLINE void filter_update(const uint16x8_t sub0, const uint16x8_t sub1, + const uint16x8_t add0, const uint16x8_t add1, + uint16x8_t *sum) { + *sum = vsubq_u16(*sum, sub0); + *sum = vsubq_u16(*sum, sub1); + *sum = vaddq_u16(*sum, add0); + *sum = vaddq_u16(*sum, add1); +} + +static INLINE uint16x8_t calc_7_tap_filter_kernel(const uint16x8_t sub0, + const uint16x8_t sub1, + const uint16x8_t add0, + const uint16x8_t add1, + uint16x8_t *sum) { + filter_update(sub0, sub1, add0, add1, sum); + return vrshrq_n_u16(*sum, 3); +} + +static INLINE uint16x8_t apply_15_tap_filter_kernel( + const uint16x8_t flat, const uint16x8_t sub0, const uint16x8_t sub1, + const uint16x8_t add0, const uint16x8_t add1, const uint16x8_t in, + uint16x8_t *sum) { + filter_update(sub0, sub1, add0, add1, sum); + return vbslq_u16(flat, vrshrq_n_u16(*sum, 4), in); +} + +// 7-tap filter [1, 1, 1, 2, 1, 1, 1] +static INLINE void calc_7_tap_filter(const uint16x8_t p3, const uint16x8_t p2, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, + uint16x8_t *op2, uint16x8_t *op1, + uint16x8_t *op0, uint16x8_t *oq0, + uint16x8_t *oq1, uint16x8_t *oq2) { + uint16x8_t sum; + sum = vaddq_u16(p3, p3); // 2*p3 + sum = vaddq_u16(sum, p3); // 3*p3 + sum = vaddq_u16(sum, p2); // 3*p3+p2 + sum = vaddq_u16(sum, p2); // 3*p3+2*p2 + sum = vaddq_u16(sum, p1); // 3*p3+2*p2+p1 + sum = vaddq_u16(sum, p0); // 3*p3+2*p2+p1+p0 + sum = vaddq_u16(sum, q0); // 3*p3+2*p2+p1+p0+q0 + *op2 = vrshrq_n_u16(sum, 3); + *op1 = calc_7_tap_filter_kernel(p3, p2, p1, q1, &sum); + *op0 = calc_7_tap_filter_kernel(p3, p1, p0, q2, &sum); + *oq0 = calc_7_tap_filter_kernel(p3, p0, q0, q3, &sum); + *oq1 = calc_7_tap_filter_kernel(p2, q0, q1, q3, &sum); + *oq2 = calc_7_tap_filter_kernel(p1, q1, q2, q3, &sum); +} + +static INLINE void apply_7_tap_filter(const uint16x8_t flat, + const uint16x8_t p3, const uint16x8_t p2, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, + uint16x8_t *op2, uint16x8_t *op1, + uint16x8_t *op0, uint16x8_t *oq0, + uint16x8_t *oq1, uint16x8_t *oq2) { + uint16x8_t tp1, tp0, tq0, tq1; + calc_7_tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0, &tq0, &tq1, + oq2); + *op2 = vbslq_u16(flat, *op2, p2); + *op1 = vbslq_u16(flat, tp1, *op1); + *op0 = vbslq_u16(flat, tp0, *op0); + *oq0 = vbslq_u16(flat, tq0, *oq0); + *oq1 = vbslq_u16(flat, tq1, *oq1); + *oq2 = vbslq_u16(flat, *oq2, q2); +} + +// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] +static INLINE void apply_15_tap_filter( + const uint16x8_t flat2, const uint16x8_t p7, const uint16x8_t p6, + const uint16x8_t p5, const uint16x8_t p4, const uint16x8_t p3, + const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2, + const uint16x8_t q3, const uint16x8_t q4, const uint16x8_t q5, + const uint16x8_t q6, const uint16x8_t q7, uint16x8_t *op6, uint16x8_t *op5, + uint16x8_t *op4, uint16x8_t *op3, uint16x8_t *op2, uint16x8_t *op1, + uint16x8_t *op0, uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2, + uint16x8_t *oq3, uint16x8_t *oq4, uint16x8_t *oq5, uint16x8_t *oq6) { + uint16x8_t sum; + sum = vshlq_n_u16(p7, 3); // 8*p7 + sum = vsubq_u16(sum, p7); // 7*p7 + sum = vaddq_u16(sum, p6); // 7*p7+p6 + sum = vaddq_u16(sum, p6); // 7*p7+2*p6 + sum = vaddq_u16(sum, p5); // 7*p7+2*p6+p5 + 
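calc_7_tap_filter() evaluates the [1, 1, 1, 2, 1, 1, 1] smoothing as a running sum: the first output is formed in full, then filter_update() drops two samples and adds two for each following tap position, and vrshrq_n_u16(sum, 3) applies the rounding shift. The same scheme written as scalar code (sketch only; ROUND_POWER_OF_TWO is defined locally to keep it self-contained):

#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

static void filter8_taps(uint16_t p3, uint16_t p2, uint16_t p1, uint16_t p0,
                         uint16_t q0, uint16_t q1, uint16_t q2, uint16_t q3,
                         uint16_t *op2, uint16_t *op1, uint16_t *op0,
                         uint16_t *oq0, uint16_t *oq1, uint16_t *oq2) {
  uint32_t sum = 3 * p3 + 2 * p2 + p1 + p0 + q0;
  *op2 = (uint16_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += p1 + q1 - p3 - p2; /* drop p3, p2; add p1, q1 */
  *op1 = (uint16_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += p0 + q2 - p3 - p1; /* drop p3, p1; add p0, q2 */
  *op0 = (uint16_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += q0 + q3 - p3 - p0; /* drop p3, p0; add q0, q3 */
  *oq0 = (uint16_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += q1 + q3 - p2 - q0; /* drop p2, q0; add q1, q3 */
  *oq1 = (uint16_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += q2 + q3 - p1 - q1; /* drop p1, q1; add q2, q3 */
  *oq2 = (uint16_t)ROUND_POWER_OF_TWO(sum, 3);
}

The 15-tap path that follows works the same way with a wider window and a rounding shift by 4 instead of 3.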
sum = vaddq_u16(sum, p4); // 7*p7+2*p6+p5+p4 + sum = vaddq_u16(sum, p3); // 7*p7+2*p6+p5+p4+p3 + sum = vaddq_u16(sum, p2); // 7*p7+2*p6+p5+p4+p3+p2 + sum = vaddq_u16(sum, p1); // 7*p7+2*p6+p5+p4+p3+p2+p1 + sum = vaddq_u16(sum, p0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 + sum = vaddq_u16(sum, q0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 + *op6 = vbslq_u16(flat2, vrshrq_n_u16(sum, 4), p6); + *op5 = apply_15_tap_filter_kernel(flat2, p7, p6, p5, q1, p5, &sum); + *op4 = apply_15_tap_filter_kernel(flat2, p7, p5, p4, q2, p4, &sum); + *op3 = apply_15_tap_filter_kernel(flat2, p7, p4, p3, q3, p3, &sum); + *op2 = apply_15_tap_filter_kernel(flat2, p7, p3, p2, q4, *op2, &sum); + *op1 = apply_15_tap_filter_kernel(flat2, p7, p2, p1, q5, *op1, &sum); + *op0 = apply_15_tap_filter_kernel(flat2, p7, p1, p0, q6, *op0, &sum); + *oq0 = apply_15_tap_filter_kernel(flat2, p7, p0, q0, q7, *oq0, &sum); + *oq1 = apply_15_tap_filter_kernel(flat2, p6, q0, q1, q7, *oq1, &sum); + *oq2 = apply_15_tap_filter_kernel(flat2, p5, q1, q2, q7, *oq2, &sum); + *oq3 = apply_15_tap_filter_kernel(flat2, p4, q2, q3, q7, q3, &sum); + *oq4 = apply_15_tap_filter_kernel(flat2, p3, q3, q4, q7, q4, &sum); + *oq5 = apply_15_tap_filter_kernel(flat2, p2, q4, q5, q7, q5, &sum); + *oq6 = apply_15_tap_filter_kernel(flat2, p1, q5, q6, q7, q6, &sum); +} + +static INLINE void filter4(const uint16x8_t mask, const uint16x8_t hev, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + uint16x8_t *op1, uint16x8_t *op0, uint16x8_t *oq0, + uint16x8_t *oq1, const int bd) { + const int16x8_t max = vdupq_n_s16((1 << (bd - 1)) - 1); + const int16x8_t min = vdupq_n_s16((int16_t)(((uint32_t)-1) << (bd - 1))); + int16x8_t filter, filter1, filter2, t; + int16x8_t ps1 = flip_sign(p1, bd); + int16x8_t ps0 = flip_sign(p0, bd); + int16x8_t qs0 = flip_sign(q0, bd); + int16x8_t qs1 = flip_sign(q1, bd); + + /* add outer taps if we have high edge variance */ + filter = vsubq_s16(ps1, qs1); + filter = vmaxq_s16(filter, min); + filter = vminq_s16(filter, max); + filter = vandq_s16(filter, vreinterpretq_s16_u16(hev)); + t = vsubq_s16(qs0, ps0); + + /* inner taps */ + filter = vaddq_s16(filter, t); + filter = vaddq_s16(filter, t); + filter = vaddq_s16(filter, t); + filter = vmaxq_s16(filter, min); + filter = vminq_s16(filter, max); + filter = vandq_s16(filter, vreinterpretq_s16_u16(mask)); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + /* if it equals 4 we'll set it to adjust by -1 to account for the fact */ + /* we'd round it by 3 the other way */ + t = vaddq_s16(filter, vdupq_n_s16(4)); + t = vminq_s16(t, max); + filter1 = vshrq_n_s16(t, 3); + t = vaddq_s16(filter, vdupq_n_s16(3)); + t = vminq_s16(t, max); + filter2 = vshrq_n_s16(t, 3); + + qs0 = vsubq_s16(qs0, filter1); + qs0 = vmaxq_s16(qs0, min); + qs0 = vminq_s16(qs0, max); + ps0 = vaddq_s16(ps0, filter2); + ps0 = vmaxq_s16(ps0, min); + ps0 = vminq_s16(ps0, max); + *oq0 = flip_sign_back(qs0, bd); + *op0 = flip_sign_back(ps0, bd); + + /* outer tap adjustments */ + filter = vrshrq_n_s16(filter1, 1); + filter = vbicq_s16(filter, vreinterpretq_s16_u16(hev)); + + qs1 = vsubq_s16(qs1, filter); + qs1 = vmaxq_s16(qs1, min); + qs1 = vminq_s16(qs1, max); + ps1 = vaddq_s16(ps1, filter); + ps1 = vmaxq_s16(ps1, min); + ps1 = vminq_s16(ps1, max); + *oq1 = flip_sign_back(qs1, bd); + *op1 = flip_sign_back(ps1, bd); +} + +static INLINE void filter8(const uint16x8_t mask, const uint16x8_t flat, + const uint32_t flat_status, const uint16x8_t hev, + const uint16x8_t p3, const uint16x8_t 
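filter4() runs the standard 4-tap loop filter on sign-flipped samples: flip_sign() subtracts 0x80 << (bd - 8) so the arithmetic can use signed saturation around zero, and flip_sign_back() restores the offset afterwards. A per-pixel scalar restatement, assuming arithmetic right shifts as vshrq_n_s16 provides (illustrative; clamp_s/filter4_px are names invented here):

#include <stdint.h>

static int clamp_s(int v, int bd) {
  const int hi = (1 << (bd - 1)) - 1;
  const int lo = -(1 << (bd - 1));
  return v < lo ? lo : (v > hi ? hi : v);
}

static void filter4_px(int mask, int hev, uint16_t *p1, uint16_t *p0,
                       uint16_t *q0, uint16_t *q1, int bd) {
  const int offset = 0x80 << (bd - 8);
  int ps1 = *p1 - offset, ps0 = *p0 - offset;
  int qs0 = *q0 - offset, qs1 = *q1 - offset;
  int f, f1, f2;

  f = clamp_s(ps1 - qs1, bd);
  if (!hev) f = 0;              /* outer tap only on high edge variance */
  f = clamp_s(f + 3 * (qs0 - ps0), bd);
  if (!mask) f = 0;             /* position not filtered at all */

  f1 = clamp_s(f + 4, bd) >> 3; /* q0 side rounds by 4 ... */
  f2 = clamp_s(f + 3, bd) >> 3; /* ... p0 side rounds by 3 */
  qs0 = clamp_s(qs0 - f1, bd);
  ps0 = clamp_s(ps0 + f2, bd);

  f = (f1 + 1) >> 1;            /* outer tap adjustment */
  if (hev) f = 0;
  qs1 = clamp_s(qs1 - f, bd);
  ps1 = clamp_s(ps1 + f, bd);

  *p1 = (uint16_t)(ps1 + offset);
  *p0 = (uint16_t)(ps0 + offset);
  *q0 = (uint16_t)(qs0 + offset);
  *q1 = (uint16_t)(qs1 + offset);
}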
p2, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, + uint16x8_t *op2, uint16x8_t *op1, uint16x8_t *op0, + uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2, + const int bd) { + if (flat_status != (uint32_t)-4) { + filter4(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1, bd); + *op2 = p2; + *oq2 = q2; + if (flat_status) { + apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, + oq0, oq1, oq2); + } + } else { + calc_7_tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, oq0, oq1, + oq2); + } +} + +static INLINE void filter16( + const uint16x8_t mask, const uint16x8_t flat, const uint32_t flat_status, + const uint16x8_t flat2, const uint32_t flat2_status, const uint16x8_t hev, + const uint16x8_t p7, const uint16x8_t p6, const uint16x8_t p5, + const uint16x8_t p4, const uint16x8_t p3, const uint16x8_t p2, + const uint16x8_t p1, const uint16x8_t p0, const uint16x8_t q0, + const uint16x8_t q1, const uint16x8_t q2, const uint16x8_t q3, + const uint16x8_t q4, const uint16x8_t q5, const uint16x8_t q6, + const uint16x8_t q7, uint16x8_t *op6, uint16x8_t *op5, uint16x8_t *op4, + uint16x8_t *op3, uint16x8_t *op2, uint16x8_t *op1, uint16x8_t *op0, + uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2, uint16x8_t *oq3, + uint16x8_t *oq4, uint16x8_t *oq5, uint16x8_t *oq6, const int bd) { + if (flat_status != (uint32_t)-4) { + filter4(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1, bd); + } + + if (flat_status) { + *op2 = p2; + *oq2 = q2; + if (flat2_status != (uint32_t)-4) { + apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, + oq0, oq1, oq2); + } + if (flat2_status) { + apply_15_tap_filter(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, + q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, + oq0, oq1, oq2, oq3, oq4, oq5, oq6); + } + } +} + +static INLINE void load_8x8(const uint16_t *s, const int p, uint16x8_t *p3, + uint16x8_t *p2, uint16x8_t *p1, uint16x8_t *p0, + uint16x8_t *q0, uint16x8_t *q1, uint16x8_t *q2, + uint16x8_t *q3) { + *p3 = vld1q_u16(s); + s += p; + *p2 = vld1q_u16(s); + s += p; + *p1 = vld1q_u16(s); + s += p; + *p0 = vld1q_u16(s); + s += p; + *q0 = vld1q_u16(s); + s += p; + *q1 = vld1q_u16(s); + s += p; + *q2 = vld1q_u16(s); + s += p; + *q3 = vld1q_u16(s); +} + +static INLINE void load_8x16(const uint16_t *s, const int p, uint16x8_t *s0, + uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3, + uint16x8_t *s4, uint16x8_t *s5, uint16x8_t *s6, + uint16x8_t *s7, uint16x8_t *s8, uint16x8_t *s9, + uint16x8_t *s10, uint16x8_t *s11, uint16x8_t *s12, + uint16x8_t *s13, uint16x8_t *s14, + uint16x8_t *s15) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); + s += p; + *s4 = vld1q_u16(s); + s += p; + *s5 = vld1q_u16(s); + s += p; + *s6 = vld1q_u16(s); + s += p; + *s7 = vld1q_u16(s); + s += p; + *s8 = vld1q_u16(s); + s += p; + *s9 = vld1q_u16(s); + s += p; + *s10 = vld1q_u16(s); + s += p; + *s11 = vld1q_u16(s); + s += p; + *s12 = vld1q_u16(s); + s += p; + *s13 = vld1q_u16(s); + s += p; + *s14 = vld1q_u16(s); + s += p; + *s15 = vld1q_u16(s); +} + +static INLINE void store_8x4(uint16_t *s, const int p, const uint16x8_t s0, + const uint16x8_t s1, const uint16x8_t s2, + const uint16x8_t s3) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); + s += p; + vst1q_u16(s, s3); +} + +static INLINE void store_8x6(uint16_t *s, const int p, const uint16x8_t s0, + const uint16x8_t s1, const 
uint16x8_t s2, + const uint16x8_t s3, const uint16x8_t s4, + const uint16x8_t s5) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); + s += p; + vst1q_u16(s, s3); + s += p; + vst1q_u16(s, s4); + s += p; + vst1q_u16(s, s5); +} + +static INLINE void store_4x8(uint16_t *s, const int p, const uint16x8_t p1, + const uint16x8_t p0, const uint16x8_t q0, + const uint16x8_t q1) { + uint16x8x4_t o; + + o.val[0] = p1; + o.val[1] = p0; + o.val[2] = q0; + o.val[3] = q1; + vst4q_lane_u16(s, o, 0); + s += p; + vst4q_lane_u16(s, o, 1); + s += p; + vst4q_lane_u16(s, o, 2); + s += p; + vst4q_lane_u16(s, o, 3); + s += p; + vst4q_lane_u16(s, o, 4); + s += p; + vst4q_lane_u16(s, o, 5); + s += p; + vst4q_lane_u16(s, o, 6); + s += p; + vst4q_lane_u16(s, o, 7); +} + +static INLINE void store_6x8(uint16_t *s, const int p, const uint16x8_t s0, + const uint16x8_t s1, const uint16x8_t s2, + const uint16x8_t s3, const uint16x8_t s4, + const uint16x8_t s5) { + uint16x8x3_t o0, o1; + + o0.val[0] = s0; + o0.val[1] = s1; + o0.val[2] = s2; + o1.val[0] = s3; + o1.val[1] = s4; + o1.val[2] = s5; + vst3q_lane_u16(s - 3, o0, 0); + vst3q_lane_u16(s + 0, o1, 0); + s += p; + vst3q_lane_u16(s - 3, o0, 1); + vst3q_lane_u16(s + 0, o1, 1); + s += p; + vst3q_lane_u16(s - 3, o0, 2); + vst3q_lane_u16(s + 0, o1, 2); + s += p; + vst3q_lane_u16(s - 3, o0, 3); + vst3q_lane_u16(s + 0, o1, 3); + s += p; + vst3q_lane_u16(s - 3, o0, 4); + vst3q_lane_u16(s + 0, o1, 4); + s += p; + vst3q_lane_u16(s - 3, o0, 5); + vst3q_lane_u16(s + 0, o1, 5); + s += p; + vst3q_lane_u16(s - 3, o0, 6); + vst3q_lane_u16(s + 0, o1, 6); + s += p; + vst3q_lane_u16(s - 3, o0, 7); + vst3q_lane_u16(s + 0, o1, 7); +} + +static INLINE void store_7x8(uint16_t *s, const int p, const uint16x8_t s0, + const uint16x8_t s1, const uint16x8_t s2, + const uint16x8_t s3, const uint16x8_t s4, + const uint16x8_t s5, const uint16x8_t s6) { + uint16x8x4_t o0; + uint16x8x3_t o1; + + o0.val[0] = s0; + o0.val[1] = s1; + o0.val[2] = s2; + o0.val[3] = s3; + o1.val[0] = s4; + o1.val[1] = s5; + o1.val[2] = s6; + vst4q_lane_u16(s - 4, o0, 0); + vst3q_lane_u16(s + 0, o1, 0); + s += p; + vst4q_lane_u16(s - 4, o0, 1); + vst3q_lane_u16(s + 0, o1, 1); + s += p; + vst4q_lane_u16(s - 4, o0, 2); + vst3q_lane_u16(s + 0, o1, 2); + s += p; + vst4q_lane_u16(s - 4, o0, 3); + vst3q_lane_u16(s + 0, o1, 3); + s += p; + vst4q_lane_u16(s - 4, o0, 4); + vst3q_lane_u16(s + 0, o1, 4); + s += p; + vst4q_lane_u16(s - 4, o0, 5); + vst3q_lane_u16(s + 0, o1, 5); + s += p; + vst4q_lane_u16(s - 4, o0, 6); + vst3q_lane_u16(s + 0, o1, 6); + s += p; + vst4q_lane_u16(s - 4, o0, 7); + vst3q_lane_u16(s + 0, o1, 7); +} + +static INLINE void store_8x14(uint16_t *s, const int p, const uint16x8_t p6, + const uint16x8_t p5, const uint16x8_t p4, + const uint16x8_t p3, const uint16x8_t p2, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, + const uint16x8_t q4, const uint16x8_t q5, + const uint16x8_t q6, const uint32_t flat_status, + const uint32_t flat2_status) { + if (flat_status) { + if (flat2_status) { + vst1q_u16(s - 7 * p, p6); + vst1q_u16(s - 6 * p, p5); + vst1q_u16(s - 5 * p, p4); + vst1q_u16(s - 4 * p, p3); + vst1q_u16(s + 3 * p, q3); + vst1q_u16(s + 4 * p, q4); + vst1q_u16(s + 5 * p, q5); + vst1q_u16(s + 6 * p, q6); + } + vst1q_u16(s - 3 * p, p2); + vst1q_u16(s + 2 * p, q2); + } + vst1q_u16(s - 2 * p, p1); + vst1q_u16(s - 1 * p, p0); + vst1q_u16(s + 0 * p, q0); + vst1q_u16(s + 1 * p, q1); +} + +void 
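filter8()/filter16() and store_8x14() together implement a widest-filter-wins scheme; the summary below lists which rows each branch may rewrite, which is also exactly the set of rows store_8x14() writes back.

/* Per pixel position (explanatory summary of filter16 + store_8x14):
 *   mask only              -> filter4 rewrites  p1 p0 q0 q1
 *   mask && flat           -> 7-tap   rewrites  p2 p1 p0 q0 q1 q2
 *   mask && flat && flat2  -> 15-tap  rewrites  p6 ... p0 q0 ... q6
 * The aggregated flat_status / flat2_status values only let whole branches
 * and stores be skipped when no lane (or every lane) is flagged. */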
vpx_highbd_lpf_horizontal_4_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + mask, hev; + + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, + q2, q3, &hev, &mask); + filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd); + store_8x4(s - 2 * p, p, p1, p0, q0, q1); +} + +void vpx_highbd_lpf_horizontal_4_dual_neon( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_4_neon(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_4_neon(s + 8, p, blimit1, limit1, thresh1, bd); +} + +void vpx_highbd_lpf_vertical_4_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + mask, hev; + + load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + transpose_s16_8x8((int16x8_t *)&p3, (int16x8_t *)&p2, (int16x8_t *)&p1, + (int16x8_t *)&p0, (int16x8_t *)&q0, (int16x8_t *)&q1, + (int16x8_t *)&q2, (int16x8_t *)&q3); + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, + q2, q3, &hev, &mask); + filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd); + store_4x8(s - 2, p, p1, p0, q0, q1); +} + +void vpx_highbd_lpf_vertical_4_dual_neon( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd); +} + +void vpx_highbd_lpf_horizontal_8_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, + q0, q1, q2, q3, &flat, &flat_status, &hev, bd); + filter8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2, bd); + store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2); +} + +void vpx_highbd_lpf_horizontal_8_dual_neon( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, bd); +} + +void vpx_highbd_lpf_vertical_8_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_8x8(s - 4, p, &p3, &p2, &p1, &p0, 
&q0, &q1, &q2, &q3); + transpose_s16_8x8((int16x8_t *)&p3, (int16x8_t *)&p2, (int16x8_t *)&p1, + (int16x8_t *)&p0, (int16x8_t *)&q0, (int16x8_t *)&q1, + (int16x8_t *)&q2, (int16x8_t *)&q3); + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, + q0, q1, q2, q3, &flat, &flat_status, &hev, bd); + filter8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2, bd); + // Note: store_6x8() is faster than transpose + store_8x8(). + store_6x8(s, p, op2, op1, op0, oq0, oq1, oq2); +} + +void vpx_highbd_lpf_vertical_8_dual_neon( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd); +} + +static void lpf_horizontal_16_kernel(uint16_t *s, int p, + const uint16x8_t blimit_vec, + const uint16x8_t limit_vec, + const uint16x8_t thresh_vec, + const int bd) { + uint16x8_t mask, flat, flat2, hev, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, + q3, q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, + oq4, oq5, oq6; + uint32_t flat_status, flat2_status; + + load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2, + &q3, &q4, &q5, &q6, &q7); + mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, + q0, q1, q2, q3, &flat, &flat_status, &hev, bd); + flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, + &flat2_status, bd); + filter16(mask, flat, flat_status, flat2, flat2_status, hev, p7, p6, p5, p4, + p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, + &op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, + bd); + store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, + oq5, oq6, flat_status, flat2_status); +} + +static void lpf_vertical_16_kernel(uint16_t *s, int p, + const uint16x8_t blimit_vec, + const uint16x8_t limit_vec, + const uint16x8_t thresh_vec, const int bd) { + uint16x8_t mask, flat, flat2, hev, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, + q3, q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, + oq4, oq5, oq6; + uint32_t flat_status, flat2_status; + + load_8x8(s - 8, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0); + transpose_s16_8x8((int16x8_t *)&p7, (int16x8_t *)&p6, (int16x8_t *)&p5, + (int16x8_t *)&p4, (int16x8_t *)&p3, (int16x8_t *)&p2, + (int16x8_t *)&p1, (int16x8_t *)&p0); + load_8x8(s, p, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + transpose_s16_8x8((int16x8_t *)&q0, (int16x8_t *)&q1, (int16x8_t *)&q2, + (int16x8_t *)&q3, (int16x8_t *)&q4, (int16x8_t *)&q5, + (int16x8_t *)&q6, (int16x8_t *)&q7); + mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, + q0, q1, q2, q3, &flat, &flat_status, &hev, bd); + flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, + &flat2_status, bd); + filter16(mask, flat, flat_status, flat2, flat2_status, hev, p7, p6, p5, p4, + p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, + &op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, + bd); + if (flat_status) { + if (flat2_status) { + store_7x8(s - 3, p, op6, op5, op4, op3, op2, op1, op0); + store_7x8(s + 4, p, oq0, oq1, oq2, oq3, oq4, oq5, oq6); + } else { + // Note: store_6x8() is faster than 
transpose + store_8x8(). + store_6x8(s, p, op2, op1, op0, oq0, oq1, oq2); + } + } else { + store_4x8(s - 2, p, op1, op0, oq0, oq1); + } +} + +void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + lpf_horizontal_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd); +} + +void vpx_highbd_lpf_horizontal_16_dual_neon(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + lpf_horizontal_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd); + lpf_horizontal_16_kernel(s + 8, p, blimit_vec, limit_vec, thresh_vec, bd); +} + +void vpx_highbd_lpf_vertical_16_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + lpf_vertical_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd); +} + +void vpx_highbd_lpf_vertical_16_dual_neon(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + lpf_vertical_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd); + lpf_vertical_16_kernel(s + 8 * p, p, blimit_vec, limit_vec, thresh_vec, bd); +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c new file mode 100644 index 0000000000..c46c016312 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c @@ -0,0 +1,931 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_ports/mem.h" + +static INLINE void load_4x4(const int16_t *s, const ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + int16x4_t *const s2, int16x4_t *const s3) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); +} + +static INLINE void load_8x4(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *const s0, uint16x8_t *const s1, + uint16x8_t *const s2, uint16x8_t *const s3) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); +} + +static INLINE void load_8x8(const int16_t *s, const ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, + int16x8_t *const s4, int16x8_t *const s5, + int16x8_t *const s6, int16x8_t *const s7) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); + s += p; + *s7 = vld1q_s16(s); +} + +static INLINE void store_8x8(uint16_t *s, const ptrdiff_t p, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2, const uint16x8_t s3, + const uint16x8_t s4, const uint16x8_t s5, + const uint16x8_t s6, const uint16x8_t s7) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); + s += p; + vst1q_u16(s, s3); + s += p; + vst1q_u16(s, s4); + s += p; + vst1q_u16(s, s5); + s += p; + vst1q_u16(s, s6); + s += p; + vst1q_u16(s, s7); +} + +static INLINE int32x4_t highbd_convolve8_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t filters) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int32x4_t sum; + + sum = vmull_lane_s16(s0, filters_lo, 0); + sum = vmlal_lane_s16(sum, s1, filters_lo, 1); + sum = vmlal_lane_s16(sum, s2, filters_lo, 2); + sum = vmlal_lane_s16(sum, s3, filters_lo, 3); + sum = vmlal_lane_s16(sum, s4, filters_hi, 0); + sum = vmlal_lane_s16(sum, s5, filters_hi, 1); + sum = vmlal_lane_s16(sum, s6, filters_hi, 2); + sum = vmlal_lane_s16(sum, s7, filters_hi, 3); + return sum; +} + +static INLINE uint16x8_t +highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filters, const uint16x8_t max) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int32x4_t sum0, sum1; + uint16x8_t d; + + sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filters_hi, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3); + sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1); + sum1 = vmlal_lane_s16(sum1, 
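highbd_convolve8_4() and highbd_convolve8_8() evaluate an 8-tap FIR with vmull_lane_s16/vmlal_lane_s16 multiply-accumulates; vqrshrun_n_s32(sum, 7) then applies the rounding shift by FILTER_BITS and saturates negative results to zero, and vminq_u16 clamps to the bd-bit maximum. For one output pixel the computation is simply (scalar sketch, constants defined locally for self-containment):

#include <stdint.h>

#define FILTER_BITS 7
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

static uint16_t convolve8_px(const int16_t *s, const int16_t *filter, int bd) {
  const int max = (1 << bd) - 1;
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += s[k] * filter[k]; /* 8-tap FIR */
  sum = ROUND_POWER_OF_TWO(sum, FILTER_BITS);
  return (uint16_t)(sum < 0 ? 0 : (sum > max ? max : sum));
}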
vget_high_s16(s2), filters_lo, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filters_hi, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3); + d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7)); + d = vminq_u16(d, max); + return d; +} + +void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (x_step_q4 != 16) { + vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); + } else { + const int16x8_t filters = vld1q_s16(filter[x0_q4]); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x8_t t0, t1, t2, t3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + + src -= 3; + + if (h == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int32x4_t d0, d1, d2, d3; + uint16x8_t d01, d23; + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + load_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u16_8x4(&t0, &t1, &t2, &t3); + s0 = vreinterpret_s16_u16(vget_low_u16(t0)); + s1 = vreinterpret_s16_u16(vget_low_u16(t1)); + s2 = vreinterpret_s16_u16(vget_low_u16(t2)); + s3 = vreinterpret_s16_u16(vget_low_u16(t3)); + s4 = vreinterpret_s16_u16(vget_high_u16(t0)); + s5 = vreinterpret_s16_u16(vget_high_u16(t1)); + s6 = vreinterpret_s16_u16(vget_high_u16(t2)); + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + src += 7; + + do { + load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10); + transpose_s16_4x4d(&s7, &s8, &s9, &s10); + + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); + d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); + d01 = vminq_u16(d01, max); + d23 = vminq_u16(d23, max); + transpose_u16_4x4q(&d01, &d23); + + vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01)); + vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23)); + vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01)); + vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23)); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4; + dst += 4; + w -= 4; + } while (w > 0); + } else { + int16x8_t t4, t5, t6, t7; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint16x8_t d0, d1, d2, d3; + + if (w == 4) { + do { + load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, + &s5, &s6, &s7); + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10, + &t4, &t5, &t6, &t7); + src += 8 * src_stride; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * 
dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7); + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + d3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + + transpose_u16_8x4(&d0, &d1, &d2, &d3); + vst1_u16(dst, vget_low_u16(d0)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d1)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d2)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d3)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d0)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d1)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d2)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d3)); + dst += dst_stride; + h -= 8; + } while (h > 0); + } else { + int width; + const uint16_t *s; + uint16_t *d; + int16x8_t s11, s12, s13, s14; + uint16x8_t d4, d5, d6, d7; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, + &s5, &s6, &s7); + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + width = w; + s = src + 7; + d = dst; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + + do { + load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11, + &s12, &s13, &s14); + transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14); + + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, + max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, + max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, + max); + d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, + max); + d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, + max); + d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, + max); + d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, + max); + d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, + filters, max); + + transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; 
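A short note on the structure of the 8-wide horizontal paths above (and of the avg variant that follows), since it is easy to lose in the intrinsics:

/* Each 8x8 tile of source rows is transposed so that one vector holds one
 * source column, the 8-tap filter is applied across s0..s14 to produce eight
 * filtered columns, and the results are transposed back before the store.
 * Between horizontally adjacent tiles only eight new vectors are loaded,
 * because s0..s6 take over the previous tile's s8..s14. */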
+ } while (h > 0); + } + } + } +} + +void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h, int bd) { + if (x_step_q4 != 16) { + vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); + } else { + const int16x8_t filters = vld1q_s16(filter[x0_q4]); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x8_t t0, t1, t2, t3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + + src -= 3; + + if (h == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int32x4_t d0, d1, d2, d3; + uint16x8_t d01, d23, t01, t23; + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + load_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u16_8x4(&t0, &t1, &t2, &t3); + s0 = vreinterpret_s16_u16(vget_low_u16(t0)); + s1 = vreinterpret_s16_u16(vget_low_u16(t1)); + s2 = vreinterpret_s16_u16(vget_low_u16(t2)); + s3 = vreinterpret_s16_u16(vget_low_u16(t3)); + s4 = vreinterpret_s16_u16(vget_high_u16(t0)); + s5 = vreinterpret_s16_u16(vget_high_u16(t1)); + s6 = vreinterpret_s16_u16(vget_high_u16(t2)); + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + src += 7; + + do { + load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10); + transpose_s16_4x4d(&s7, &s8, &s9, &s10); + + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); + t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); + t01 = vminq_u16(t01, max); + t23 = vminq_u16(t23, max); + transpose_u16_4x4q(&t01, &t23); + + d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), + vld1_u16(dst + 2 * dst_stride)); + d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride), + vld1_u16(dst + 3 * dst_stride)); + d01 = vrhaddq_u16(d01, t01); + d23 = vrhaddq_u16(d23, t23); + + vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01)); + vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23)); + vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01)); + vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23)); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4; + dst += 4; + w -= 4; + } while (w > 0); + } else { + int16x8_t t4, t5, t6, t7; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3; + + if (w == 4) { + do { + load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, + &s5, &s6, &s7); + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10, + &t4, &t5, &t6, &t7); + src += 8 * src_stride; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + 
__builtin_prefetch(dst + 7 * dst_stride); + transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7); + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + t3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + transpose_u16_8x4(&t0, &t1, &t2, &t3); + + d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), + vld1_u16(dst + 4 * dst_stride)); + d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride), + vld1_u16(dst + 5 * dst_stride)); + d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride), + vld1_u16(dst + 6 * dst_stride)); + d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride), + vld1_u16(dst + 7 * dst_stride)); + d0 = vrhaddq_u16(d0, t0); + d1 = vrhaddq_u16(d1, t1); + d2 = vrhaddq_u16(d2, t2); + d3 = vrhaddq_u16(d3, t3); + + vst1_u16(dst, vget_low_u16(d0)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d1)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d2)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d3)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d0)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d1)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d2)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d3)); + dst += dst_stride; + h -= 8; + } while (h > 0); + } else { + int width; + const uint16_t *s; + uint16_t *d; + int16x8_t s11, s12, s13, s14; + uint16x8_t d4, d5, d6, d7; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, + &s5, &s6, &s7); + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + width = w; + s = src + 7; + d = dst; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + + do { + load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11, + &s12, &s13, &s14); + transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14); + + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, + max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, + max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, + max); + d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, + max); + d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, + max); + d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, + max); + d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, + max); + d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, + filters, max); + + 
transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); + d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride)); + d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride)); + d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride)); + d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride)); + + store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); + } + } + } +} + +void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (y_step_q4 != 16) { + vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); + } else { + const int16x8_t filters = vld1q_s16(filter[y0_q4]); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + + src -= 3 * src_stride; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int32x4_t d0, d1, d2, d3; + uint16x8_t d01, d23; + + s0 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s1 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s2 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s3 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s4 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s5 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s6 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + + do { + s7 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s8 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s9 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s10 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); + d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); + d01 = vminq_u16(d01, max); + d23 = vminq_u16(d23, max); + vst1_u16(dst, vget_low_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d23)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d23)); + dst += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + h -= 4; + } while (h > 0); + } else { + int height; + const uint16_t *s; + uint16_t *d; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint16x8_t d0, d1, d2, d3; + + do { 
+ __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + s = src; + s0 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s1 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s2 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s3 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s4 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s5 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s6 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + d = dst; + height = h; + + do { + s7 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s8 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s9 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s10 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + d3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + + vst1q_u16(d, d0); + d += dst_stride; + vst1q_u16(d, d1); + d += dst_stride; + vst1q_u16(d, d2); + d += dst_stride; + vst1q_u16(d, d3); + d += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } + } +} + +void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (y_step_q4 != 16) { + vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); + } else { + const int16x8_t filters = vld1q_s16(filter[y0_q4]); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + + src -= 3 * src_stride; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int32x4_t d0, d1, d2, d3; + uint16x8_t d01, d23, t01, t23; + + s0 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s1 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s2 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s3 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s4 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s5 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s6 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + + do { + s7 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s8 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s9 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s10 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 
1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); + t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); + t01 = vminq_u16(t01, max); + t23 = vminq_u16(t23, max); + + d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), + vld1_u16(dst + 1 * dst_stride)); + d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride), + vld1_u16(dst + 3 * dst_stride)); + d01 = vrhaddq_u16(d01, t01); + d23 = vrhaddq_u16(d23, t23); + + vst1_u16(dst, vget_low_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d23)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d23)); + dst += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + h -= 4; + } while (h > 0); + } else { + int height; + const uint16_t *s; + uint16_t *d; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + s = src; + s0 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s1 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s2 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s3 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s4 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s5 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s6 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + d = dst; + height = h; + + do { + s7 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s8 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s9 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s10 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + t3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + + d0 = vld1q_u16(d + 0 * dst_stride); + d1 = vld1q_u16(d + 1 * dst_stride); + d2 = vld1q_u16(d + 2 * dst_stride); + d3 = vld1q_u16(d + 3 * dst_stride); + d0 = vrhaddq_u16(d0, t0); + d1 = vrhaddq_u16(d1, t1); + d2 = vrhaddq_u16(d2, t2); + d3 = vrhaddq_u16(d3, t3); + + vst1q_u16(d, d0); + d += dst_stride; + vst1q_u16(d, d1); + d += dst_stride; + vst1q_u16(d, d2); + d 
+= dst_stride; + vst1q_u16(d, d3); + d += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } + } +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c new file mode 100644 index 0000000000..765a054f8d --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + (void)bd; + + if (w < 8) { // avg4 + uint16x4_t s0, s1, d0, d1; + uint16x8_t s01, d01; + do { + s0 = vld1_u16(src); + d0 = vld1_u16(dst); + src += src_stride; + s1 = vld1_u16(src); + d1 = vld1_u16(dst + dst_stride); + src += src_stride; + s01 = vcombine_u16(s0, s1); + d01 = vcombine_u16(d0, d1); + d01 = vrhaddq_u16(s01, d01); + vst1_u16(dst, vget_low_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d01)); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w == 8) { // avg8 + uint16x8_t s0, s1, d0, d1; + do { + s0 = vld1q_u16(src); + d0 = vld1q_u16(dst); + src += src_stride; + s1 = vld1q_u16(src); + d1 = vld1q_u16(dst + dst_stride); + src += src_stride; + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + + vst1q_u16(dst, d0); + dst += dst_stride; + vst1q_u16(dst, d1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w < 32) { // avg16 + uint16x8_t s0l, s0h, s1l, s1h, d0l, d0h, d1l, d1h; + do { + s0l = vld1q_u16(src); + s0h = vld1q_u16(src + 8); + d0l = vld1q_u16(dst); + d0h = vld1q_u16(dst + 8); + src += src_stride; + s1l = vld1q_u16(src); + s1h = vld1q_u16(src + 8); + d1l = vld1q_u16(dst + dst_stride); + d1h = vld1q_u16(dst + dst_stride + 8); + src += src_stride; + + d0l = vrhaddq_u16(s0l, d0l); + d0h = vrhaddq_u16(s0h, d0h); + d1l = vrhaddq_u16(s1l, d1l); + d1h = vrhaddq_u16(s1h, d1h); + + vst1q_u16(dst, d0l); + vst1q_u16(dst + 8, d0h); + dst += dst_stride; + vst1q_u16(dst, d1l); + vst1q_u16(dst + 8, d1h); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w == 32) { // avg32 + uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3; + do { + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + d0 = vld1q_u16(dst); + d1 = vld1q_u16(dst + 8); + d2 = vld1q_u16(dst + 16); + d3 = vld1q_u16(dst + 24); + src += src_stride; + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst, d0); + vst1q_u16(dst + 8, d1); + vst1q_u16(dst + 16, d2); + vst1q_u16(dst + 24, d3); + dst += dst_stride; + + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + d0 = vld1q_u16(dst); + d1 = vld1q_u16(dst + 8); + d2 
= vld1q_u16(dst + 16); + d3 = vld1q_u16(dst + 24); + src += src_stride; + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst, d0); + vst1q_u16(dst + 8, d1); + vst1q_u16(dst + 16, d2); + vst1q_u16(dst + 24, d3); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // avg64 + uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3; + do { + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + d0 = vld1q_u16(dst); + d1 = vld1q_u16(dst + 8); + d2 = vld1q_u16(dst + 16); + d3 = vld1q_u16(dst + 24); + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst, d0); + vst1q_u16(dst + 8, d1); + vst1q_u16(dst + 16, d2); + vst1q_u16(dst + 24, d3); + + s0 = vld1q_u16(src + 32); + s1 = vld1q_u16(src + 40); + s2 = vld1q_u16(src + 48); + s3 = vld1q_u16(src + 56); + d0 = vld1q_u16(dst + 32); + d1 = vld1q_u16(dst + 40); + d2 = vld1q_u16(dst + 48); + d3 = vld1q_u16(dst + 56); + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst + 32, d0); + vst1q_u16(dst + 40, d1); + vst1q_u16(dst + 48, d2); + vst1q_u16(dst + 56, d3); + src += src_stride; + dst += dst_stride; + } while (--h); + } +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c new file mode 100644 index 0000000000..9d2752e097 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + (void)bd; + + if (w < 8) { // copy4 + do { + vst1_u16(dst, vld1_u16(src)); + src += src_stride; + dst += dst_stride; + vst1_u16(dst, vld1_u16(src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w == 8) { // copy8 + do { + vst1q_u16(dst, vld1q_u16(src)); + src += src_stride; + dst += dst_stride; + vst1q_u16(dst, vld1q_u16(src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w < 32) { // copy16 + do { + vst2q_u16(dst, vld2q_u16(src)); + src += src_stride; + dst += dst_stride; + vst2q_u16(dst, vld2q_u16(src)); + src += src_stride; + dst += dst_stride; + vst2q_u16(dst, vld2q_u16(src)); + src += src_stride; + dst += dst_stride; + vst2q_u16(dst, vld2q_u16(src)); + src += src_stride; + dst += dst_stride; + h -= 4; + } while (h > 0); + } else if (w == 32) { // copy32 + do { + vst4q_u16(dst, vld4q_u16(src)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + src += src_stride; + dst += dst_stride; + h -= 4; + } while (h > 0); + } else { // copy64 + do { + vst4q_u16(dst, vld4q_u16(src)); + vst4q_u16(dst + 32, vld4q_u16(src + 32)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + vst4q_u16(dst + 32, vld4q_u16(src + 32)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + vst4q_u16(dst + 32, vld4q_u16(src + 32)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + vst4q_u16(dst + 32, vld4q_u16(src + 32)); + src += src_stride; + dst += dst_stride; + h -= 4; + } while (h > 0); + } +} diff --git a/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c new file mode 100644 index 0000000000..414ade3530 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + // + 1 to make it divisible by 4 + uint16_t temp[64 * 136]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + /* Filter starting 3 lines back. The neon implementation will ignore the given + * height and filter a multiple of 4 lines. 
Since this goes in to the temp + * buffer which has lots of extra room and is subsequently discarded this is + * safe if somewhat less than ideal. */ + vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, + intermediate_height, bd); + + /* Step into the temp buffer 3 lines to get the actual frame data */ + vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); +} + +void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + // + 1 to make it divisible by 4 + uint16_t temp[64 * 136]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + /* This implementation has the same issues as above. In addition, we only want + * to average the values after both passes. + */ + vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, + intermediate_height, bd); + vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); +} diff --git a/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm deleted file mode 100644 index dc459e20d9..0000000000 --- a/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm +++ /dev/null @@ -1,198 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. 
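As an aside on the intermediate-buffer sizing used by vpx_highbd_convolve8_neon above: the sketch below is illustrative only (not part of the patch) and assumes SUBPEL_BITS == 4 and SUBPEL_TAPS == 8, the values defined in vpx_dsp/vpx_filter.h. For the unscaled y_step_q4 == 16 case the expression reduces to h + 7, so even a 64x64 block needs only 71 of the 136 rows in the temp buffer.

#include <stdio.h>

/* Illustrative helper mirroring the intermediate_height expression above. */
#define SUBPEL_BITS 4
#define SUBPEL_TAPS 8

static int intermediate_height(int h, int y0_q4, int y_step_q4) {
  return (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
}

int main(void) {
  printf("%d\n", intermediate_height(64, 15, 16)); /* prints 71 */
  return 0;
}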
-; - - - EXPORT |vpx_idct16x16_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|vpx_idct16x16_1_add_neon| PROC - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 6) - add r0, r0, #32 ; + (1 <<((6) - 1)) - asr r0, r0, #6 ; >> 6 - - vdup.s16 q0, r0 ; duplicate a1 - mov r0, #8 - sub r2, #8 - - ; load destination data row0 - row3 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row4 - row7 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row8 - row11 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - 
vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row12 - row15 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - bx lr - ENDP ; |vpx_idct16x16_1_add_neon| - - END diff --git a/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c index 466b408893..bf5192a683 100644 --- a/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c @@ -10,49 +10,68 @@ #include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, j, a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - for (d1 = d2 = dest, i = 0; i < 4; i++) { - for (j = 0; j < 2; j++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d3u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d5u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; - - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8)); - d2 += 
dest_stride; - } - } - return; +static INLINE void idct16x16_1_add_pos_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a = vld1q_u8(*dest); + const uint8x16_t b = vqaddq_u8(a, res); + vst1q_u8(*dest, b); + *dest += stride; +} + +static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a = vld1q_u8(*dest); + const uint8x16_t b = vqsubq_u8(a, res); + vst1q_u8(*dest, b); + *dest += stride; +} + +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); + + if (a1 >= 0) { + const uint8x16_t dc = create_dcq(a1); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + } else { + const uint8x16_t dc = create_dcq(-a1); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + } } diff --git a/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm deleted file mode 100644 index 22a0c95941..0000000000 --- a/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm +++ /dev/null @@ -1,1179 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vpx_idct16x16_256_add_neon_pass1| - EXPORT |vpx_idct16x16_256_add_neon_pass2| - EXPORT |vpx_idct16x16_10_add_neon_pass1| - EXPORT |vpx_idct16x16_10_add_neon_pass2| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. 
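For the rewritten vpx_idct16x16_1_add_neon above, the DC value a1 follows directly from constants that appear elsewhere in this diff (cospi_16_64 == 11585, a 14-bit round shift, and ROUND_POWER_OF_TWO(out, 6)). The scalar sketch below is illustrative only, ignores WRAPLOW's wrapping, and uses a hypothetical function name; because the destination pixels are unsigned bytes, the code above applies a positive a1 with vqaddq_u8 and a negative one with vqsubq_u8 on its magnitude.

#include <stdint.h>
#include <stdio.h>

/* Illustrative scalar model of the DC-only computation. */
static int16_t idct16x16_dc(int16_t input0) {
  const int32_t cospi_16_64 = 11585;
  const int16_t out0 =
      (int16_t)(((int32_t)input0 * cospi_16_64 + (1 << 13)) >> 14);
  const int16_t out1 = (int16_t)((out0 * cospi_16_64 + (1 << 13)) >> 14);
  return (int16_t)((out1 + 32) >> 6); /* ROUND_POWER_OF_TWO(out1, 6) */
}

int main(void) {
  printf("%d\n", idct16x16_dc(100)); /* out0 == 71, out1 == 50, prints 1 */
  return 0;
}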
- MACRO - TRANSPOSE8X8 - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - MEND - - AREA Block, CODE, READONLY ; name this block of code -;void |vpx_idct16x16_256_add_neon_pass1|(int16_t *input, -; int16_t *output, int output_stride) -; -; r0 int16_t input -; r1 int16_t *output -; r2 int output_stride) - -; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vpx_idct16x16_256_add_neon_pass1| PROC - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q1,q2}, [r0]! - vmov.s16 q15, q1 - - ; generate cospi_28_64 = 3196 - mov r3, #0xc00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r12, #0x3e00 - add r12, #0xc5 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 d0, r3 ; duplicate cospi_28_64 - vdup.16 d1, r12 ; duplicate cospi_4_64 - - ; preloading to avoid stall - ; generate cospi_12_64 = 13623 - mov r3, #0x3500 - add r3, #0x37 - - ; generate cospi_20_64 = 9102 - mov r12, #0x2300 - add r12, #0x8e - - ; step2[4] * cospi_28_64 - vmull.s16 q2, d18, d0 - vmull.s16 q3, d19, d0 - - ; step2[4] * cospi_4_64 - vmull.s16 q5, d18, d1 - vmull.s16 q6, d19, d1 - - ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64 - vmlsl.s16 q2, d30, d1 - vmlsl.s16 q3, d31, d1 - - ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64 - vmlal.s16 q5, d30, d0 - vmlal.s16 q6, d31, d0 - - vdup.16 d2, r3 ; duplicate cospi_12_64 - vdup.16 d3, r12 ; duplicate cospi_20_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d8, q2, #14 ; >> 14 - vqrshrn.s32 d9, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d14, q5, #14 ; >> 14 - vqrshrn.s32 d15, q6, #14 ; >> 14 - - ; preloading to avoid stall - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 - - ; generate cospi_24_64 = 6270 - mov r12, #0x1800 - add r12, #0x7e - - ; step2[5] * cospi_12_64 - vmull.s16 q2, d26, d2 - vmull.s16 q3, d27, d2 - - ; step2[5] * cospi_20_64 - vmull.s16 q9, d26, d3 - vmull.s16 q15, d27, d3 - - ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64 - vmlsl.s16 q2, d22, d3 - vmlsl.s16 q3, d23, d3 - - ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64 - vmlal.s16 q9, d22, d2 - vmlal.s16 q15, d23, d2 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q2, #14 ; >> 14 - vqrshrn.s32 d11, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q15, #14 ; >> 14 - - ; stage 4 - vdup.16 d30, r3 ; cospi_16_64 - - ; step1[0] * cospi_16_64 - vmull.s16 q2, d16, d30 - vmull.s16 q11, d17, d30 - - ; step1[1] * cospi_16_64 - vmull.s16 q0, d24, d30 - vmull.s16 q1, d25, d30 - - ; generate cospi_8_64 = 15137 - mov r3, #0x3b00 - add r3, #0x21 - - vdup.16 d30, r12 ; duplicate cospi_24_64 - vdup.16 d31, r3 ; duplicate cospi_8_64 - - ; temp1 = (step1[0] + step1[1]) * cospi_16_64 - vadd.s32 q3, q2, q0 - vadd.s32 q12, q11, q1 - - ; temp2 = (step1[0] - step1[1]) * cospi_16_64 - vsub.s32 q13, q2, q0 - vsub.s32 q1, q11, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d16, q3, 
#14 ; >> 14 - vqrshrn.s32 d17, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d18, q13, #14 ; >> 14 - vqrshrn.s32 d19, q1, #14 ; >> 14 - - ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - ; step1[2] * cospi_8_64 - vmull.s16 q0, d20, d31 - vmull.s16 q1, d21, d31 - - ; step1[2] * cospi_24_64 - vmull.s16 q12, d20, d30 - vmull.s16 q13, d21, d30 - - ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64 - vmlal.s16 q0, d28, d30 - vmlal.s16 q1, d29, d30 - - ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlsl.s16 q12, d28, d31 - vmlsl.s16 q13, d29, d31 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d22, q0, #14 ; >> 14 - vqrshrn.s32 d23, q1, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d20, q12, #14 ; >> 14 - vqrshrn.s32 d21, q13, #14 ; >> 14 - - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]; - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]; - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]; - vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7]; - - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 - - ; stage 5 - vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3]; - vadd.s16 q1, q9, q10 ; step1[1] = step2[1] + step2[2]; - vsub.s16 q2, q9, q10 ; step1[2] = step2[1] - step2[2]; - vsub.s16 q3, q8, q11 ; step1[3] = step2[0] - step2[3]; - - vdup.16 d16, r3; ; duplicate cospi_16_64 - - ; step2[5] * cospi_16_64 - vmull.s16 q11, d26, d16 - vmull.s16 q12, d27, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; temp1 = (step2[6] - step2[5]) * cospi_16_64 - vsub.s32 q6, q9, q11 - vsub.s32 q13, q10, q12 - - ; temp2 = (step2[5] + step2[6]) * cospi_16_64 - vadd.s32 q9, q9, q11 - vadd.s32 q10, q10, q12 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q6, #14 ; >> 14 - vqrshrn.s32 d11, q13, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q10, #14 ; >> 14 - - ; stage 6 - vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; step2[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; step2[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; step2[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - step1[7]; - - ; store the data - vst1.64 {d16}, [r1], r2 - vst1.64 {d17}, [r1], r2 - vst1.64 {d18}, [r1], r2 - vst1.64 {d19}, [r1], r2 - vst1.64 {d20}, [r1], r2 - vst1.64 {d21}, [r1], r2 - vst1.64 {d22}, [r1], r2 - vst1.64 {d23}, [r1], r2 - vst1.64 {d24}, [r1], r2 - vst1.64 {d25}, [r1], r2 - vst1.64 {d26}, [r1], r2 - vst1.64 {d27}, [r1], r2 - vst1.64 {d28}, [r1], r2 - vst1.64 {d29}, [r1], r2 - vst1.64 {d30}, [r1], r2 - vst1.64 {d31}, [r1], r2 - - bx lr - ENDP ; |vpx_idct16x16_256_add_neon_pass1| - -;void vpx_idct16x16_256_add_neon_pass2(int16_t *src, -; int16_t *output, -; int16_t *pass1Output, -; int16_t skip_adding, -; uint8_t *dest, -; int dest_stride) -; -; r0 int16_t *src -; r1 int16_t *output, -; r2 int16_t *pass1Output, -; r3 int16_t skip_adding, -; r4 uint8_t *dest, -; r5 int dest_stride) - -; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vpx_idct16x16_256_add_neon_pass2| PROC - push {r3-r9} - - ; TODO(hkuang): Find a better way to load the elements. 
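The recurring "vqrshrn.s32 ..., #14" under the dct_const_round_shift comments in the assembly above is the rounding narrow that implements that macro: it adds 1 << 13, shifts right by 14, and saturates to 16 bits. A one-line intrinsics sketch of the same mapping follows; it is illustrative only and the accumulator name is hypothetical.

#include <arm_neon.h>

/* vqrshrn_n_s32(acc, 14) == saturate16((acc + (1 << 13)) >> 14),
 * i.e. dct_const_round_shift(acc) up to int16 saturation. */
static int16x4_t dct_const_round_shift_vec(const int32x4_t acc) {
  return vqrshrn_n_s32(acc, 14);
}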
- ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q0,q1}, [r0]! - vmov.s16 q15, q0; - - ; generate cospi_30_64 = 1606 - mov r3, #0x0600 - add r3, #0x46 - - ; generate cospi_2_64 = 16305 - mov r12, #0x3f00 - add r12, #0xb1 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 d12, r3 ; duplicate cospi_30_64 - vdup.16 d13, r12 ; duplicate cospi_2_64 - - ; preloading to avoid stall - ; generate cospi_14_64 = 12665 - mov r3, #0x3100 - add r3, #0x79 - - ; generate cospi_18_64 = 10394 - mov r12, #0x2800 - add r12, #0x9a - - ; step1[8] * cospi_30_64 - vmull.s16 q2, d16, d12 - vmull.s16 q3, d17, d12 - - ; step1[8] * cospi_2_64 - vmull.s16 q1, d16, d13 - vmull.s16 q4, d17, d13 - - ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64 - vmlsl.s16 q2, d30, d13 - vmlsl.s16 q3, d31, d13 - - ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64 - vmlal.s16 q1, d30, d12 - vmlal.s16 q4, d31, d12 - - vdup.16 d30, r3 ; duplicate cospi_14_64 - vdup.16 d31, r12 ; duplicate cospi_18_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d0, q2, #14 ; >> 14 - vqrshrn.s32 d1, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d14, q1, #14 ; >> 14 - vqrshrn.s32 d15, q4, #14 ; >> 14 - - ; preloading to avoid stall - ; generate cospi_22_64 = 7723 - mov r3, #0x1e00 - add r3, #0x2b - - ; generate cospi_10_64 = 14449 - mov r12, #0x3800 - add r12, #0x71 - - ; step1[9] * cospi_14_64 - vmull.s16 q2, d24, d30 - vmull.s16 q3, d25, d30 - - ; step1[9] * cospi_18_64 - vmull.s16 q4, d24, d31 - vmull.s16 q5, d25, d31 - - ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64 - vmlsl.s16 q2, d22, d31 - vmlsl.s16 q3, d23, d31 - - ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64 - vmlal.s16 q4, d22, d30 - vmlal.s16 q5, d23, d30 - - vdup.16 d30, r3 ; duplicate cospi_22_64 - vdup.16 d31, r12 ; duplicate cospi_10_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q2, #14 ; >> 14 - vqrshrn.s32 d3, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q4, #14 ; >> 14 - vqrshrn.s32 d13, q5, #14 ; >> 14 - - ; step1[10] * cospi_22_64 - vmull.s16 q11, d20, d30 - vmull.s16 q12, d21, d30 - - ; step1[10] * cospi_10_64 - vmull.s16 q4, d20, d31 - vmull.s16 q5, d21, d31 - - ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64 - vmlsl.s16 q11, d26, d31 - vmlsl.s16 q12, d27, d31 - - ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64 - vmlal.s16 q4, d26, d30 - vmlal.s16 q5, d27, d30 - - ; preloading to avoid stall - ; generate cospi_6_64 = 15679 - mov r3, #0x3d00 - add r3, #0x3f - - ; generate cospi_26_64 = 4756 - mov r12, #0x1200 - add r12, #0x94 - - vdup.16 d30, r3 ; duplicate cospi_6_64 - vdup.16 d31, r12 ; duplicate cospi_26_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q11, #14 ; >> 14 - vqrshrn.s32 d5, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d11, q5, #14 ; >> 14 - vqrshrn.s32 d10, q4, #14 ; >> 14 - - ; step1[11] * cospi_6_64 - vmull.s16 q10, d28, d30 - vmull.s16 q11, d29, d30 - - ; step1[11] * cospi_26_64 - vmull.s16 q12, d28, d31 - vmull.s16 q13, d29, d31 - - ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64 - vmlsl.s16 q10, d18, d31 - vmlsl.s16 q11, d19, d31 - - ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64 - vmlal.s16 q12, d18, d30 - vmlal.s16 q13, d19, d30 - - vsub.s16 q9, q0, q1 ; 
step1[9]=step2[8]-step2[9] - vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9] - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q11, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d8, q12, #14 ; >> 14 - vqrshrn.s32 d9, q13, #14 ; >> 14 - - ; stage 3 - vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11] - vadd.s16 q11, q2, q3 ; step1[11]=step2[10]+step2[11] - vadd.s16 q12, q4, q5 ; step1[12]=step2[12]+step2[13] - vsub.s16 q13, q4, q5 ; step1[13]=step2[12]-step2[13] - vsub.s16 q14, q7, q6 ; step1[14]=-step2[14]+tep2[15] - vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15] - - ; stage 4 - ; generate cospi_24_64 = 6270 - mov r3, #0x1800 - add r3, #0x7e - - ; generate cospi_8_64 = 15137 - mov r12, #0x3b00 - add r12, #0x21 - - ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vdup.16 d30, r12 ; duplicate cospi_8_64 - vdup.16 d31, r3 ; duplicate cospi_24_64 - - ; step1[9] * cospi_24_64 - vmull.s16 q2, d18, d31 - vmull.s16 q3, d19, d31 - - ; step1[14] * cospi_24_64 - vmull.s16 q4, d28, d31 - vmull.s16 q5, d29, d31 - - ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 - vmlal.s16 q2, d28, d30 - vmlal.s16 q3, d29, d30 - - ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vmlsl.s16 q4, d18, d30 - vmlsl.s16 q5, d19, d30 - - rsb r12, #0 - vdup.16 d30, r12 ; duplicate -cospi_8_64 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q2, #14 ; >> 14 - vqrshrn.s32 d13, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q4, #14 ; >> 14 - vqrshrn.s32 d3, q5, #14 ; >> 14 - - vmov.s16 q3, q11 - vmov.s16 q4, q12 - - ; - step1[13] * cospi_8_64 - vmull.s16 q11, d26, d30 - vmull.s16 q12, d27, d30 - - ; -step1[10] * cospi_8_64 - vmull.s16 q8, d20, d30 - vmull.s16 q9, d21, d30 - - ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlsl.s16 q11, d20, d31 - vmlsl.s16 q12, d21, d31 - - ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlal.s16 q8, d26, d31 - vmlal.s16 q9, d27, d31 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d4, q11, #14 ; >> 14 - vqrshrn.s32 d5, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q8, #14 ; >> 14 - vqrshrn.s32 d11, q9, #14 ; >> 14 - - ; stage 5 - vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; - vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; - vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; - vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; - vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; - vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; - vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; - vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; - - ; stage 6. 
- ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - vdup.16 d14, r12 ; duplicate cospi_16_64 - - ; step1[13] * cospi_16_64 - vmull.s16 q3, d26, d14 - vmull.s16 q4, d27, d14 - - ; step1[10] * cospi_16_64 - vmull.s16 q0, d20, d14 - vmull.s16 q1, d21, d14 - - ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 - vsub.s32 q5, q3, q0 - vsub.s32 q6, q4, q1 - - ; temp2 = (step1[10] + step1[13]) * cospi_16_64 - vadd.s32 q10, q3, q0 - vadd.s32 q4, q4, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q5, #14 ; >> 14 - vqrshrn.s32 d5, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q10, #14 ; >> 14 - vqrshrn.s32 d11, q4, #14 ; >> 14 - - ; step1[11] * cospi_16_64 - vmull.s16 q0, d22, d14 - vmull.s16 q1, d23, d14 - - ; step1[12] * cospi_16_64 - vmull.s16 q13, d24, d14 - vmull.s16 q6, d25, d14 - - ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 - vsub.s32 q10, q13, q0 - vsub.s32 q4, q6, q1 - - ; temp2 = (step1[11] + step1[12]) * cospi_16_64 - vadd.s32 q13, q13, q0 - vadd.s32 q6, q6, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q4, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d8, q13, #14 ; >> 14 - vqrshrn.s32 d9, q6, #14 ; >> 14 - - mov r4, #16 ; pass1Output stride - ldr r3, [sp] ; load skip_adding - cmp r3, #0 ; check if need adding dest data - beq skip_adding_dest - - ldr r7, [sp, #28] ; dest used to save element 0-7 - mov r9, r7 ; save dest pointer for later use - ldr r8, [sp, #32] ; load dest_stride - - ; stage 7 - ; load the data in pass1 - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + 
dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - - ; store the data output 8,9,10,11,12,13,14,15 - vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q8 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q9, q9, #6 - vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q9 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q2, q2, #6 - vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q2 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q3, q3, #6 - vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q3 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q4, q4, #6 - vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q4 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q5, q5, #6 - vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q5 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q14, q14, #6 - vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q14 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q15, q15, #6 - vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q15 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - b end_idct16x16_pass2 - -skip_adding_dest - ; stage 7 - ; load the data in pass1 - mov r5, #24 - mov r3, #8 - - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vst1.64 {d24}, [r1], r3 ; store output[0] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[1] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vst1.64 {d24}, [r1], r3 ; store output[2] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 
; store output[3] - vst1.64 {d27}, [r1], r5 - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vst1.64 {d24}, [r1], r3 ; store output[4] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[5] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - vst1.64 {d24}, [r1], r3 ; store output[6] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[7] - vst1.64 {d27}, [r1], r5 - - ; store the data output 8,9,10,11,12,13,14,15 - vst1.64 {d16}, [r1], r3 - vst1.64 {d17}, [r1], r5 - vst1.64 {d18}, [r1], r3 - vst1.64 {d19}, [r1], r5 - vst1.64 {d4}, [r1], r3 - vst1.64 {d5}, [r1], r5 - vst1.64 {d6}, [r1], r3 - vst1.64 {d7}, [r1], r5 - vst1.64 {d8}, [r1], r3 - vst1.64 {d9}, [r1], r5 - vst1.64 {d10}, [r1], r3 - vst1.64 {d11}, [r1], r5 - vst1.64 {d28}, [r1], r3 - vst1.64 {d29}, [r1], r5 - vst1.64 {d30}, [r1], r3 - vst1.64 {d31}, [r1], r5 -end_idct16x16_pass2 - pop {r3-r9} - bx lr - ENDP ; |vpx_idct16x16_256_add_neon_pass2| - -;void |vpx_idct16x16_10_add_neon_pass1|(int16_t *input, -; int16_t *output, int output_stride) -; -; r0 int16_t input -; r1 int16_t *output -; r2 int output_stride) - -; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vpx_idct16x16_10_add_neon_pass1| PROC - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q1,q2}, [r0]! - vmov.s16 q15, q1 - - ; generate cospi_28_64*2 = 6392 - mov r3, #0x1800 - add r3, #0xf8 - - ; generate cospi_4_64*2 = 32138 - mov r12, #0x7d00 - add r12, #0x8a - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 q0, r3 ; duplicate cospi_28_64*2 - vdup.16 q1, r12 ; duplicate cospi_4_64*2 - - ; The following instructions use vqrdmulh to do the - ; dct_const_round_shift(step2[4] * cospi_28_64). vvqrdmulh will multiply, - ; double, and return the high 16 bits, effectively giving >> 15. Doubling - ; the constant will change this to >> 14. 
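The doubled-constant trick described just above carries over directly to intrinsics: vqrdmulh computes saturate((2 * a * b + (1 << 15)) >> 16), so feeding it 2 * cospi_28_64 == 6392 yields saturate((a * cospi_28_64 + (1 << 13)) >> 14), which is dct_const_round_shift(a * cospi_28_64). A minimal sketch, illustrative only and with a hypothetical register name:

#include <arm_neon.h>

/* dct_const_round_shift(step2_4 * cospi_28_64) via the doubled constant,
 * matching the asm above (cospi_28_64 * 2 == 6392). */
static int16x8_t round_shift_by_cospi_28_64(const int16x8_t step2_4) {
  return vqrdmulhq_n_s16(step2_4, 6392);
}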
- ; dct_const_round_shift(step2[4] * cospi_28_64); - vqrdmulh.s16 q4, q9, q0 - - ; preloading to avoid stall - ; generate cospi_16_64*2 = 23170 - mov r3, #0x5a00 - add r3, #0x82 - - ; dct_const_round_shift(step2[4] * cospi_4_64); - vqrdmulh.s16 q7, q9, q1 - - ; stage 4 - vdup.16 q1, r3 ; cospi_16_64*2 - - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 - - vdup.16 d4, r3; ; duplicate cospi_16_64 - - ; dct_const_round_shift(step1[0] * cospi_16_64) - vqrdmulh.s16 q8, q8, q1 - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d14, d4 - vmull.s16 q10, d15, d4 - - ; step2[5] * cospi_16_64 - vmull.s16 q12, d9, d4 - vmull.s16 q11, d8, d4 - - ; temp1 = (step2[6] - step2[5]) * cospi_16_64 - vsub.s32 q15, q10, q12 - vsub.s32 q6, q9, q11 - - ; temp2 = (step2[5] + step2[6]) * cospi_16_64 - vadd.s32 q9, q9, q11 - vadd.s32 q10, q10, q12 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d11, q15, #14 ; >> 14 - vqrshrn.s32 d10, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q10, #14 ; >> 14 - - ; stage 6 - vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7]; - vadd.s16 q10, q8, q5 ; step2[2] = step1[2] + step1[5]; - vadd.s16 q11, q8, q4 ; step2[3] = step1[3] + step1[4]; - vadd.s16 q9, q8, q6 ; step2[1] = step1[1] + step1[6]; - vsub.s16 q12, q8, q4 ; step2[4] = step1[3] - step1[4]; - vsub.s16 q13, q8, q5 ; step2[5] = step1[2] - step1[5]; - vsub.s16 q14, q8, q6 ; step2[6] = step1[1] - step1[6]; - vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7]; - - ; store the data - vst1.64 {d4}, [r1], r2 - vst1.64 {d5}, [r1], r2 - vst1.64 {d18}, [r1], r2 - vst1.64 {d19}, [r1], r2 - vst1.64 {d20}, [r1], r2 - vst1.64 {d21}, [r1], r2 - vst1.64 {d22}, [r1], r2 - vst1.64 {d23}, [r1], r2 - vst1.64 {d24}, [r1], r2 - vst1.64 {d25}, [r1], r2 - vst1.64 {d26}, [r1], r2 - vst1.64 {d27}, [r1], r2 - vst1.64 {d28}, [r1], r2 - vst1.64 {d29}, [r1], r2 - vst1.64 {d30}, [r1], r2 - vst1.64 {d31}, [r1], r2 - - bx lr - ENDP ; |vpx_idct16x16_10_add_neon_pass1| - -;void vpx_idct16x16_10_add_neon_pass2(int16_t *src, -; int16_t *output, -; int16_t *pass1Output, -; int16_t skip_adding, -; uint8_t *dest, -; int dest_stride) -; -; r0 int16_t *src -; r1 int16_t *output, -; r2 int16_t *pass1Output, -; r3 int16_t skip_adding, -; r4 uint8_t *dest, -; r5 int dest_stride) - -; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vpx_idct16x16_10_add_neon_pass2| PROC - push {r3-r9} - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q0,q1}, [r0]! 
- vmov.s16 q15, q0; - - ; generate 2*cospi_30_64 = 3212 - mov r3, #0xc00 - add r3, #0x8c - - ; generate 2*cospi_2_64 = 32610 - mov r12, #0x7f00 - add r12, #0x62 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 q6, r3 ; duplicate 2*cospi_30_64 - - ; dct_const_round_shift(step1[8] * cospi_30_64) - vqrdmulh.s16 q0, q8, q6 - - vdup.16 q6, r12 ; duplicate 2*cospi_2_64 - - ; dct_const_round_shift(step1[8] * cospi_2_64) - vqrdmulh.s16 q7, q8, q6 - - ; preloading to avoid stall - ; generate 2*cospi_26_64 = 9512 - mov r12, #0x2500 - add r12, #0x28 - rsb r12, #0 - vdup.16 q15, r12 ; duplicate -2*cospi_26_64 - - ; generate 2*cospi_6_64 = 31358 - mov r3, #0x7a00 - add r3, #0x7e - vdup.16 q14, r3 ; duplicate 2*cospi_6_64 - - ; dct_const_round_shift(- step1[12] * cospi_26_64) - vqrdmulh.s16 q3, q9, q15 - - ; dct_const_round_shift(step1[12] * cospi_6_64) - vqrdmulh.s16 q4, q9, q14 - - ; stage 4 - ; generate cospi_24_64 = 6270 - mov r3, #0x1800 - add r3, #0x7e - vdup.16 d31, r3 ; duplicate cospi_24_64 - - ; generate cospi_8_64 = 15137 - mov r12, #0x3b00 - add r12, #0x21 - vdup.16 d30, r12 ; duplicate cospi_8_64 - - ; step1[14] * cospi_24_64 - vmull.s16 q12, d14, d31 - vmull.s16 q5, d15, d31 - - ; step1[9] * cospi_24_64 - vmull.s16 q2, d0, d31 - vmull.s16 q11, d1, d31 - - ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vmlsl.s16 q12, d0, d30 - vmlsl.s16 q5, d1, d30 - - ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 - vmlal.s16 q2, d14, d30 - vmlal.s16 q11, d15, d30 - - rsb r12, #0 - vdup.16 d30, r12 ; duplicate -cospi_8_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q12, #14 ; >> 14 - vqrshrn.s32 d3, q5, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q2, #14 ; >> 14 - vqrshrn.s32 d13, q11, #14 ; >> 14 - - ; - step1[13] * cospi_8_64 - vmull.s16 q10, d8, d30 - vmull.s16 q13, d9, d30 - - ; -step1[10] * cospi_8_64 - vmull.s16 q8, d6, d30 - vmull.s16 q9, d7, d30 - - ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64 - vmlsl.s16 q10, d6, d31 - vmlsl.s16 q13, d7, d31 - - ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlal.s16 q8, d8, d31 - vmlal.s16 q9, d9, d31 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q10, #14 ; >> 14 - vqrshrn.s32 d5, q13, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q8, #14 ; >> 14 - vqrshrn.s32 d11, q9, #14 ; >> 14 - - ; stage 5 - vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; - vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; - vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; - vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; - vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; - vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; - vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; - vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; - - ; stage 6. 
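Stage 6 below mixes the step1[10]/step1[13] and step1[11]/step1[12] pairs with cospi_16_64 (11585) and rounds with vqrshrn #14. A scalar sketch of that butterfly, using an illustrative helper name that is not part of the patch (saturation ignored):

/* Scalar model of the stage 6 cospi_16_64 butterfly performed below:
 *   *out_lo = dct_const_round_shift((b - a) * cospi_16_64)
 *   *out_hi = dct_const_round_shift((b + a) * cospi_16_64) */
static void butterfly_cospi_16_16(int16_t a, int16_t b, int16_t *out_lo,
                                  int16_t *out_hi) {
  const int32_t t1 = ((int32_t)b - a) * 11585; /* cospi_16_64 */
  const int32_t t2 = ((int32_t)b + a) * 11585;
  *out_lo = (int16_t)((t1 + (1 << 13)) >> 14);
  *out_hi = (int16_t)((t2 + (1 << 13)) >> 14);
}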
- ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - vdup.16 d14, r12 ; duplicate cospi_16_64 - - ; step1[13] * cospi_16_64 - vmull.s16 q3, d26, d14 - vmull.s16 q4, d27, d14 - - ; step1[10] * cospi_16_64 - vmull.s16 q0, d20, d14 - vmull.s16 q1, d21, d14 - - ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 - vsub.s32 q5, q3, q0 - vsub.s32 q6, q4, q1 - - ; temp2 = (step1[10] + step1[13]) * cospi_16_64 - vadd.s32 q0, q3, q0 - vadd.s32 q1, q4, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q5, #14 ; >> 14 - vqrshrn.s32 d5, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q0, #14 ; >> 14 - vqrshrn.s32 d11, q1, #14 ; >> 14 - - ; step1[11] * cospi_16_64 - vmull.s16 q0, d22, d14 - vmull.s16 q1, d23, d14 - - ; step1[12] * cospi_16_64 - vmull.s16 q13, d24, d14 - vmull.s16 q6, d25, d14 - - ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 - vsub.s32 q10, q13, q0 - vsub.s32 q4, q6, q1 - - ; temp2 = (step1[11] + step1[12]) * cospi_16_64 - vadd.s32 q13, q13, q0 - vadd.s32 q6, q6, q1 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q4, #14 ; >> 14 - - ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64); - vqrshrn.s32 d8, q13, #14 ; >> 14 - vqrshrn.s32 d9, q6, #14 ; >> 14 - - mov r4, #16 ; pass1Output stride - ldr r3, [sp] ; load skip_adding - - ; stage 7 - ; load the data in pass1 - mov r5, #24 - mov r3, #8 - - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vst1.64 {d24}, [r1], r3 ; store output[0] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[1] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vst1.64 {d24}, [r1], r3 ; store output[2] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[3] - vst1.64 {d27}, [r1], r5 - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vst1.64 {d24}, [r1], r3 ; store output[4] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[5] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - vst1.64 {d24}, [r1], r3 ; store output[6] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[7] - vst1.64 {d27}, [r1], r5 - - ; store the data output 8,9,10,11,12,13,14,15 - vst1.64 {d16}, [r1], r3 - vst1.64 {d17}, [r1], r5 - vst1.64 {d18}, [r1], r3 - vst1.64 {d19}, [r1], r5 - vst1.64 {d4}, [r1], r3 - vst1.64 {d5}, [r1], r5 - vst1.64 {d6}, [r1], r3 - vst1.64 {d7}, [r1], r5 - vst1.64 {d8}, [r1], r3 - vst1.64 {d9}, [r1], r5 - vst1.64 {d10}, [r1], r3 - vst1.64 {d11}, [r1], r5 - vst1.64 {d28}, [r1], r3 - vst1.64 {d29}, [r1], r5 - vst1.64 {d30}, [r1], r3 - vst1.64 
{d31}, [r1], r5 -end_idct10_16x16_pass2 - pop {r3-r9} - bx lr - ENDP ; |vpx_idct16x16_10_add_neon_pass2| - END diff --git a/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c index 5ca2afe488..5c5963d277 100644 --- a/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c @@ -10,1222 +10,814 @@ #include -#include "./vpx_config.h" -#include "vpx_dsp/arm/transpose_neon.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/txfm_common.h" -void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, - int output_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - // stage 3 - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d18s16, d1s16); - q6s32 = vmull_s16(d19s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlal_s16(q5s32, d30s16, d0s16); - q6s32 = vmlal_s16(q6s32, d31s16, d0s16); - - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q5s32, 14); - d15s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - q2s32 = vmull_s16(d26s16, d2s16); - q3s32 = vmull_s16(d27s16, d2s16); - q9s32 = vmull_s16(d26s16, d3s16); - q15s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d3s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d3s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q15s32 = vmlal_s16(q15s32, d23s16, d2s16); - - d10s16 
= vqrshrn_n_s32(q2s32, 14); - d11s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q15s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 4 - d30s16 = vdup_n_s16(cospi_16_64); - - q2s32 = vmull_s16(d16s16, d30s16); - q11s32 = vmull_s16(d17s16, d30s16); - q0s32 = vmull_s16(d24s16, d30s16); - q1s32 = vmull_s16(d25s16, d30s16); - - d30s16 = vdup_n_s16(cospi_24_64); - d31s16 = vdup_n_s16(cospi_8_64); - - q3s32 = vaddq_s32(q2s32, q0s32); - q12s32 = vaddq_s32(q11s32, q1s32); - q13s32 = vsubq_s32(q2s32, q0s32); - q1s32 = vsubq_s32(q11s32, q1s32); - - d16s16 = vqrshrn_n_s32(q3s32, 14); - d17s16 = vqrshrn_n_s32(q12s32, 14); - d18s16 = vqrshrn_n_s32(q13s32, 14); - d19s16 = vqrshrn_n_s32(q1s32, 14); - q8s16 = vcombine_s16(d16s16, d17s16); - q9s16 = vcombine_s16(d18s16, d19s16); - - q0s32 = vmull_s16(d20s16, d31s16); - q1s32 = vmull_s16(d21s16, d31s16); - q12s32 = vmull_s16(d20s16, d30s16); - q13s32 = vmull_s16(d21s16, d30s16); - - q0s32 = vmlal_s16(q0s32, d28s16, d30s16); - q1s32 = vmlal_s16(q1s32, d29s16, d30s16); - q12s32 = vmlsl_s16(q12s32, d28s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d29s16, d31s16); - - d22s16 = vqrshrn_n_s32(q0s32, 14); - d23s16 = vqrshrn_n_s32(q1s32, 14); - d20s16 = vqrshrn_n_s32(q12s32, 14); - d21s16 = vqrshrn_n_s32(q13s32, 14); - q10s16 = vcombine_s16(d20s16, d21s16); - q11s16 = vcombine_s16(d22s16, d23s16); - - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q15s16 = vaddq_s16(q6s16, q7s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - // stage 5 - q0s16 = vaddq_s16(q8s16, q11s16); - q1s16 = vaddq_s16(q9s16, q10s16); - q2s16 = vsubq_s16(q9s16, q10s16); - q3s16 = vsubq_s16(q8s16, q11s16); - - d16s16 = vdup_n_s16(cospi_16_64); - - q11s32 = vmull_s16(d26s16, d16s16); - q12s32 = vmull_s16(d27s16, d16s16); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - - q6s32 = vsubq_s32(q9s32, q11s32); - q13s32 = vsubq_s32(q10s32, q12s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d10s16 = vqrshrn_n_s32(q6s32, 14); - d11s16 = vqrshrn_n_s32(q13s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 6 - q8s16 = vaddq_s16(q0s16, q15s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = 
vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d16u64); - out += output_stride; - vst1_u64((uint64_t *)out, d17u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); - return; +static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0, + int16x4_t *const d1) { + *d0 = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + *d1 = vrshrn_n_s32(t32[1], DCT_CONST_BITS); } -void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride) { - uint8_t *d; - uint8x8_t d12u8, d13u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d24u64, d25u64, d26u64, d27u64; - int64x1_t d12s64, d13s64; - uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16; - uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; +static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0, + const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int32x4_t *const t32) { + t32[0] = vmull_lane_s16(s0, cospi_0_8_16_24, 3); + t32[1] = vmull_lane_s16(s1, cospi_0_8_16_24, 3); + t32[0] = vmlsl_lane_s16(t32[0], s1, cospi_0_8_16_24, 1); + t32[1] = vmlal_lane_s16(t32[1], s0, cospi_0_8_16_24, 1); +} - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; +static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int16x4_t *const d0, int16x4_t *const d1) { + int32x4_t t32[2]; - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32); + wrap_low_4x2(t32, d0, d1); +} - d16s16 
= vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); +static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int16x4_t *const d0, + int16x4_t *const d1) { + int32x4_t t32[2]; - // stage 3 - d12s16 = vdup_n_s16(cospi_30_64); - d13s16 = vdup_n_s16(cospi_2_64); + idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32); + t32[1] = vnegq_s32(t32[1]); + wrap_low_4x2(t32, d0, d1); +} - q2s32 = vmull_s16(d16s16, d12s16); - q3s32 = vmull_s16(d17s16, d12s16); - q1s32 = vmull_s16(d16s16, d13s16); - q4s32 = vmull_s16(d17s16, d13s16); +static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int16x4_t *const d0, + int16x4_t *const d1) { + int32x4_t t32[3]; - q2s32 = vmlsl_s16(q2s32, d30s16, d13s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d13s16); - q1s32 = vmlal_s16(q1s32, d30s16, d12s16); - q4s32 = vmlal_s16(q4s32, d31s16, d12s16); + t32[2] = vmull_lane_s16(s1, cospi_0_8_16_24, 2); + t32[0] = vmlsl_lane_s16(t32[2], s0, cospi_0_8_16_24, 2); + t32[1] = vmlal_lane_s16(t32[2], s0, cospi_0_8_16_24, 2); + wrap_low_4x2(t32, d0, d1); +} - d0s16 = vqrshrn_n_s32(q2s32, 14); - d1s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q1s32, 14); - d15s16 = vqrshrn_n_s32(q4s32, 14); - q0s16 = vcombine_s16(d0s16, d1s16); - q7s16 = vcombine_s16(d14s16, d15s16); +static INLINE void idct16x16_add_store(const int16x8_t *const out, + uint8_t *dest, const int stride) { + // Add the result to dest + idct16x16_add8x1(out[0], &dest, stride); + idct16x16_add8x1(out[1], &dest, stride); + idct16x16_add8x1(out[2], &dest, stride); + idct16x16_add8x1(out[3], &dest, stride); + idct16x16_add8x1(out[4], &dest, stride); + idct16x16_add8x1(out[5], &dest, stride); + idct16x16_add8x1(out[6], &dest, stride); + idct16x16_add8x1(out[7], &dest, stride); + idct16x16_add8x1(out[8], &dest, stride); + idct16x16_add8x1(out[9], &dest, stride); + idct16x16_add8x1(out[10], &dest, stride); + idct16x16_add8x1(out[11], &dest, stride); + idct16x16_add8x1(out[12], &dest, stride); + idct16x16_add8x1(out[13], &dest, stride); + idct16x16_add8x1(out[14], &dest, stride); + idct16x16_add8x1(out[15], &dest, stride); +} - d30s16 = vdup_n_s16(cospi_14_64); - d31s16 = vdup_n_s16(cospi_18_64); +static INLINE void idct16x16_add_store_bd8(int16x8_t *const out, uint16_t *dest, + const int stride) { + // Add the result to dest + const int16x8_t max = vdupq_n_s16((1 << 8) - 1); + out[0] = vrshrq_n_s16(out[0], 6); + out[1] = vrshrq_n_s16(out[1], 6); + out[2] = vrshrq_n_s16(out[2], 6); + out[3] = vrshrq_n_s16(out[3], 6); + out[4] = vrshrq_n_s16(out[4], 6); + out[5] = vrshrq_n_s16(out[5], 6); + out[6] = vrshrq_n_s16(out[6], 6); + out[7] = vrshrq_n_s16(out[7], 6); + out[8] = vrshrq_n_s16(out[8], 6); + out[9] = vrshrq_n_s16(out[9], 6); + out[10] = vrshrq_n_s16(out[10], 6); + out[11] = vrshrq_n_s16(out[11], 6); + out[12] = vrshrq_n_s16(out[12], 6); + out[13] = vrshrq_n_s16(out[13], 6); + out[14] = vrshrq_n_s16(out[14], 6); + out[15] = vrshrq_n_s16(out[15], 6); + highbd_idct16x16_add8x1(out[0], max, 
&dest, stride); + highbd_idct16x16_add8x1(out[1], max, &dest, stride); + highbd_idct16x16_add8x1(out[2], max, &dest, stride); + highbd_idct16x16_add8x1(out[3], max, &dest, stride); + highbd_idct16x16_add8x1(out[4], max, &dest, stride); + highbd_idct16x16_add8x1(out[5], max, &dest, stride); + highbd_idct16x16_add8x1(out[6], max, &dest, stride); + highbd_idct16x16_add8x1(out[7], max, &dest, stride); + highbd_idct16x16_add8x1(out[8], max, &dest, stride); + highbd_idct16x16_add8x1(out[9], max, &dest, stride); + highbd_idct16x16_add8x1(out[10], max, &dest, stride); + highbd_idct16x16_add8x1(out[11], max, &dest, stride); + highbd_idct16x16_add8x1(out[12], max, &dest, stride); + highbd_idct16x16_add8x1(out[13], max, &dest, stride); + highbd_idct16x16_add8x1(out[14], max, &dest, stride); + highbd_idct16x16_add8x1(out[15], max, &dest, stride); +} - q2s32 = vmull_s16(d24s16, d30s16); - q3s32 = vmull_s16(d25s16, d30s16); - q4s32 = vmull_s16(d24s16, d31s16); - q5s32 = vmull_s16(d25s16, d31s16); +void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0); + const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1); + const int16x4_t cospi_6_26N_14_18N = vget_high_s16(cospis1); + int16x8_t in[16], step1[16], step2[16], out[16]; - q2s32 = vmlsl_s16(q2s32, d22s16, d31s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d31s16); - q4s32 = vmlal_s16(q4s32, d22s16, d30s16); - q5s32 = vmlal_s16(q5s32, d23s16, d30s16); - - d2s16 = vqrshrn_n_s32(q2s32, 14); - d3s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q4s32, 14); - d13s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16(cospi_22_64); - d31s16 = vdup_n_s16(cospi_10_64); - - q11s32 = vmull_s16(d20s16, d30s16); - q12s32 = vmull_s16(d21s16, d30s16); - q4s32 = vmull_s16(d20s16, d31s16); - q5s32 = vmull_s16(d21s16, d31s16); - - q11s32 = vmlsl_s16(q11s32, d26s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d27s16, d31s16); - q4s32 = vmlal_s16(q4s32, d26s16, d30s16); - q5s32 = vmlal_s16(q5s32, d27s16, d30s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d11s16 = vqrshrn_n_s32(q5s32, 14); - d10s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - d30s16 = vdup_n_s16(cospi_6_64); - d31s16 = vdup_n_s16(cospi_26_64); - - q10s32 = vmull_s16(d28s16, d30s16); - q11s32 = vmull_s16(d29s16, d30s16); - q12s32 = vmull_s16(d28s16, d31s16); - q13s32 = vmull_s16(d29s16, d31s16); - - q10s32 = vmlsl_s16(q10s32, d18s16, d31s16); - q11s32 = vmlsl_s16(q11s32, d19s16, d31s16); - q12s32 = vmlal_s16(q12s32, d18s16, d30s16); - q13s32 = vmlal_s16(q13s32, d19s16, d30s16); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q11s32, 14); - d8s16 = vqrshrn_n_s32(q12s32, 14); - d9s16 = vqrshrn_n_s32(q13s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 3 - q9s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q10s16 = vsubq_s16(q3s16, q2s16); - q11s16 = vaddq_s16(q2s16, q3s16); - q12s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q6s16, q7s16); - - // stage 4 - d18s16 = 
vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); - - q2s32 = vmull_s16(d18s16, d31s16); - q3s32 = vmull_s16(d19s16, d31s16); - q4s32 = vmull_s16(d28s16, d31s16); - q5s32 = vmull_s16(d29s16, d31s16); - - q2s32 = vmlal_s16(q2s32, d28s16, d30s16); - q3s32 = vmlal_s16(q3s32, d29s16, d30s16); - q4s32 = vmlsl_s16(q4s32, d18s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d19s16, d30s16); - - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q3s32, 14); - d2s16 = vqrshrn_n_s32(q4s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - q3s16 = q11s16; - q4s16 = q12s16; - - d30s16 = vdup_n_s16(-cospi_8_64); - q11s32 = vmull_s16(d26s16, d30s16); - q12s32 = vmull_s16(d27s16, d30s16); - q8s32 = vmull_s16(d20s16, d30s16); - q9s32 = vmull_s16(d21s16, d30s16); - - q11s32 = vmlsl_s16(q11s32, d20s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d21s16, d31s16); - q8s32 = vmlal_s16(q8s32, d26s16, d31s16); - q9s32 = vmlal_s16(q9s32, d27s16, d31s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); - - // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = vdup_n_s16(cospi_16_64); - - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q10s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q10s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 7 - if (skip_adding != 0) { - d = dest; - // load the data in pass1 - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = 
vaddq_s16(q1s16, q14s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - // store the data out 8,9,10,11,12,13,14,15 - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q8s16 = vrshrq_n_s16(q8s16, 6); - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - 
d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q9s16 = vrshrq_n_s16(q9s16, 6); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q2s16 = vrshrq_n_s16(q2s16, 6); - q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q3s16 = vrshrq_n_s16(q3s16, 6); - q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q4s16 = vrshrq_n_s16(q4s16, 6); - q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q5s16 = vrshrq_n_s16(q5s16, 6); - q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q14s16 = vrshrq_n_s16(q14s16, 6); - q14u16 = - vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - q15s16 = vrshrq_n_s16(q15s16, 6); - q15u16 = - vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - } else { // skip_adding_dest - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = 
vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16))); + // Load input (16x8) + if (output) { + const tran_low_t *inputT = (const tran_low_t *)input; + in[0] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[8] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[1] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[9] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[2] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[10] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[3] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[11] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[4] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[12] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[5] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[13] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[6] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[14] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[7] = 
load_tran_low_to_s16q(inputT); + inputT += 8; + in[15] = load_tran_low_to_s16q(inputT); + } else { + const int16_t *inputT = (const int16_t *)input; + in[0] = vld1q_s16(inputT); + inputT += 8; + in[8] = vld1q_s16(inputT); + inputT += 8; + in[1] = vld1q_s16(inputT); + inputT += 8; + in[9] = vld1q_s16(inputT); + inputT += 8; + in[2] = vld1q_s16(inputT); + inputT += 8; + in[10] = vld1q_s16(inputT); + inputT += 8; + in[3] = vld1q_s16(inputT); + inputT += 8; + in[11] = vld1q_s16(inputT); + inputT += 8; + in[4] = vld1q_s16(inputT); + inputT += 8; + in[12] = vld1q_s16(inputT); + inputT += 8; + in[5] = vld1q_s16(inputT); + inputT += 8; + in[13] = vld1q_s16(inputT); + inputT += 8; + in[6] = vld1q_s16(inputT); + inputT += 8; + in[14] = vld1q_s16(inputT); + inputT += 8; + in[7] = vld1q_s16(inputT); + inputT += 8; + in[15] = vld1q_s16(inputT); } - return; -} -void vpx_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out, - int output_stride) { - int16x4_t d4s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q15s32; - int16x8x2_t q0x2s16; + // Transpose + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; + // stage 1 + step1[0] = in[0 / 2]; + step1[1] = in[16 / 2]; + step1[2] = in[8 / 2]; + step1[3] = in[24 / 2]; + step1[4] = in[4 / 2]; + step1[5] = in[20 / 2]; + step1[6] = in[12 / 2]; + step1[7] = in[28 / 2]; + step1[8] = in[2 / 2]; + step1[9] = in[18 / 2]; + step1[10] = in[10 / 2]; + step1[11] = in[26 / 2]; + step1[12] = in[6 / 2]; + step1[13] = in[22 / 2]; + step1[14] = in[14 / 2]; + step1[15] = in[30 / 2]; - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]); + idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9], + &step2[14]); + idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10], + &step2[13]); + idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11], + &step2[12]); // stage 3 - q0s16 = vdupq_n_s16(cospi_28_64 * 2); - q1s16 = vdupq_n_s16(cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - q7s16 = vqrdmulhq_s16(q9s16, q1s16); + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]); + idct_cospi_12_20(step2[5], step2[6], 
cospi_4_12_20N_28, &step1[5], &step1[6]); + step1[8] = vaddq_s16(step2[8], step2[9]); + step1[9] = vsubq_s16(step2[8], step2[9]); + step1[10] = vsubq_s16(step2[11], step2[10]); + step1[11] = vaddq_s16(step2[11], step2[10]); + step1[12] = vaddq_s16(step2[12], step2[13]); + step1[13] = vsubq_s16(step2[12], step2[13]); + step1[14] = vsubq_s16(step2[15], step2[14]); + step1[15] = vaddq_s16(step2[15], step2[14]); // stage 4 - q1s16 = vdupq_n_s16(cospi_16_64 * 2); - d4s16 = vdup_n_s16(cospi_16_64); - - q8s16 = vqrdmulhq_s16(q8s16, q1s16); - - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - q9s32 = vmull_s16(d14s16, d4s16); - q10s32 = vmull_s16(d15s16, d4s16); - q12s32 = vmull_s16(d9s16, d4s16); - q11s32 = vmull_s16(d8s16, d4s16); - - q15s32 = vsubq_s32(q10s32, q12s32); - q6s32 = vsubq_s32(q9s32, q11s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d11s16 = vqrshrn_n_s32(q15s32, 14); - d10s16 = vqrshrn_n_s32(q6s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 6 - q2s16 = vaddq_s16(q8s16, q7s16); - q9s16 = vaddq_s16(q8s16, q6s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q8s16, q4s16); - q12s16 = vsubq_s16(q8s16, q4s16); - q13s16 = vsubq_s16(q8s16, q5s16); - q14s16 = vsubq_s16(q8s16, q6s16); - q15s16 = vsubq_s16(q8s16, q7s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d4u64); - out += output_stride; - vst1_u64((uint64_t *)out, d5u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); - return; -} - -void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out, - int16_t *pass1Output, int16_t skip_adding, - uint8_t 
*dest, int dest_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16; - uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64; - uint64x1_t d16u64, d17u64, d18u64, d19u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - (void)skip_adding; - (void)dest; - (void)dest_stride; - - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // stage 3 - q6s16 = vdupq_n_s16(cospi_30_64 * 2); - q0s16 = vqrdmulhq_s16(q8s16, q6s16); - q6s16 = vdupq_n_s16(cospi_2_64 * 2); - q7s16 = vqrdmulhq_s16(q8s16, q6s16); - - q15s16 = vdupq_n_s16(-cospi_26_64 * 2); - q14s16 = vdupq_n_s16(cospi_6_64 * 2); - q3s16 = vqrdmulhq_s16(q9s16, q15s16); - q4s16 = vqrdmulhq_s16(q9s16, q14s16); - - // stage 4 - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - d6s16 = vget_low_s16(q3s16); - d7s16 = vget_high_s16(q3s16); - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); - - q12s32 = vmull_s16(d14s16, d31s16); - q5s32 = vmull_s16(d15s16, d31s16); - q2s32 = vmull_s16(d0s16, d31s16); - q11s32 = vmull_s16(d1s16, d31s16); - - q12s32 = vmlsl_s16(q12s32, d0s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d1s16, d30s16); - q2s32 = vmlal_s16(q2s32, d14s16, d30s16); - q11s32 = vmlal_s16(q11s32, d15s16, d30s16); - - d2s16 = vqrshrn_n_s32(q12s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q11s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16(-cospi_8_64); - q10s32 = vmull_s16(d8s16, d30s16); - q13s32 = vmull_s16(d9s16, d30s16); - q8s32 = vmull_s16(d6s16, d30s16); - q9s32 = vmull_s16(d7s16, d30s16); - - q10s32 = vmlsl_s16(q10s32, d6s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d7s16, d31s16); - q8s32 = vmlal_s16(q8s32, d8s16, d31s16); - q9s32 = vmlal_s16(q9s32, d9s16, d31s16); - - d4s16 = vqrshrn_n_s32(q10s32, 14); - d5s16 = vqrshrn_n_s32(q13s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); + idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]); + idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]); + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + 
step2[7] = vaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); + step1[0] = vaddq_s16(step2[0], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s16(step2[8], step2[11]); + step1[9] = vaddq_s16(step2[9], step2[10]); + step1[10] = vsubq_s16(step2[9], step2[10]); + step1[11] = vsubq_s16(step2[8], step2[11]); + step1[12] = vsubq_s16(step2[15], step2[12]); + step1[13] = vsubq_s16(step2[14], step2[13]); + step1[14] = vaddq_s16(step2[14], step2[13]); + step1[15] = vaddq_s16(step2[15], step2[12]); // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = vdup_n_s16(cospi_16_64); - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q0s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q0s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); + step2[0] = vaddq_s16(step1[0], step1[7]); + step2[1] = vaddq_s16(step1[1], step1[6]); + step2[2] = vaddq_s16(step1[2], step1[5]); + step2[3] = vaddq_s16(step1[3], step1[4]); + step2[4] = vsubq_s16(step1[3], step1[4]); + step2[5] = vsubq_s16(step1[2], step1[5]); + step2[6] = vsubq_s16(step1[1], step1[6]); + step2[7] = vsubq_s16(step1[0], step1[7]); + idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; // stage 7 - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = 
vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); + idct16x16_add_stage7(step2, out); - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16)); - d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16)); - d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16)); - d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16)); - d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16)); - d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16)); - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - vst1_u64((uint64_t *)out, d16u64); - out += 4; - vst1_u64((uint64_t *)out, d17u64); - out += 12; - vst1_u64((uint64_t *)out, d18u64); - out += 4; - vst1_u64((uint64_t *)out, d19u64); - out += 12; - 
vst1_u64((uint64_t *)out, d4u64); - out += 4; - vst1_u64((uint64_t *)out, d5u64); - out += 12; - vst1_u64((uint64_t *)out, d6u64); - out += 4; - vst1_u64((uint64_t *)out, d7u64); - out += 12; - vst1_u64((uint64_t *)out, d8u64); - out += 4; - vst1_u64((uint64_t *)out, d9u64); - out += 12; - vst1_u64((uint64_t *)out, d10u64); - out += 4; - vst1_u64((uint64_t *)out, d11u64); - out += 12; - vst1_u64((uint64_t *)out, d28u64); - out += 4; - vst1_u64((uint64_t *)out, d29u64); - out += 12; - vst1_u64((uint64_t *)out, d30u64); - out += 4; - vst1_u64((uint64_t *)out, d31u64); - return; + if (output) { + idct16x16_store_pass1(out, output); + } else { + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } + } +} + +void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output, + void *const dest, const int stride, + const int highbd_flag) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); + const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0); + const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0); + const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1); + const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1); + int16x8_t in[8], step1[16], step2[16], out[16]; + + // Load input (8x8) + if (output) { + const tran_low_t *inputT = (const tran_low_t *)input; + in[0] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[1] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[2] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[3] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[4] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[5] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[6] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[7] = load_tran_low_to_s16q(inputT); + } else { + const int16_t *inputT = (const int16_t *)input; + in[0] = vld1q_s16(inputT); + inputT += 16; + in[1] = vld1q_s16(inputT); + inputT += 16; + in[2] = vld1q_s16(inputT); + inputT += 16; + in[3] = vld1q_s16(inputT); + inputT += 16; + in[4] = vld1q_s16(inputT); + inputT += 16; + in[5] = vld1q_s16(inputT); + inputT += 16; + in[6] = vld1q_s16(inputT); + inputT += 16; + in[7] = vld1q_s16(inputT); + } + + // Transpose + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[2] = in[8 / 2]; + step1[4] = in[4 / 2]; + step1[6] = in[12 / 2]; + step1[8] = in[2 / 2]; + step1[10] = in[10 / 2]; + step1[12] = in[6 / 2]; + step1[14] = in[14 / 2]; // 0 in pass 1 + + // stage 2 + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[4] = step1[4]; + step2[6] = step1[6]; + step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1); + step2[9] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 3); + step2[10] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 3); + step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1); + step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0); + step2[13] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 2); + step2[14] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 2); + step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0); + + // stage 3 + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3); + 
step1[5] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 2); + step1[6] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 1); + step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0); + step1[8] = vaddq_s16(step2[8], step2[9]); + step1[9] = vsubq_s16(step2[8], step2[9]); + step1[10] = vsubq_s16(step2[11], step2[10]); + step1[11] = vaddq_s16(step2[11], step2[10]); + step1[12] = vaddq_s16(step2[12], step2[13]); + step1[13] = vsubq_s16(step2[12], step2[13]); + step1[14] = vsubq_s16(step2[15], step2[14]); + step1[15] = vaddq_s16(step2[15], step2[14]); + + // stage 4 + step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2); + step2[2] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 3); + step2[3] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 1); + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = vaddq_s16(step2[0], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s16(step2[8], step2[11]); + step1[9] = vaddq_s16(step2[9], step2[10]); + step1[10] = vsubq_s16(step2[9], step2[10]); + step1[11] = vsubq_s16(step2[8], step2[11]); + step1[12] = vsubq_s16(step2[15], step2[12]); + step1[13] = vsubq_s16(step2[14], step2[13]); + step1[14] = vaddq_s16(step2[14], step2[13]); + step1[15] = vaddq_s16(step2[15], step2[12]); + + // stage 6 + step2[0] = vaddq_s16(step1[0], step1[7]); + step2[1] = vaddq_s16(step1[1], step1[6]); + step2[2] = vaddq_s16(step1[2], step1[5]); + step2[3] = vaddq_s16(step1[3], step1[4]); + step2[4] = vsubq_s16(step1[3], step1[4]); + step2[5] = vsubq_s16(step1[2], step1[5]); + step2[6] = vsubq_s16(step1[1], step1[6]); + step2[7] = vsubq_s16(step1[0], step1[7]); + idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + idct16x16_add_stage7(step2, out); + + if (output) { + idct16x16_store_pass1(out, output); + } else { + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } + } +} + +void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input, + int16_t *output) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); + const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0); + const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0); + const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1); + const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1); + int16x4_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x4) + in[0] = load_tran_low_to_s16d(input); + input += 16; + in[1] 
= load_tran_low_to_s16d(input); + input += 16; + in[2] = load_tran_low_to_s16d(input); + input += 16; + in[3] = load_tran_low_to_s16d(input); + + // Transpose + transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1); + step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1); + step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0); + step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0); + + // stage 3 + step1[0] = step2[0]; + step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3); + step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vadd_s16(step2[8], step2[11]); + step1[9] = vadd_s16(step2[9], step2[10]); + step1[10] = vsub_s16(step2[9], step2[10]); + step1[11] = vsub_s16(step2[8], step2[11]); + step1[12] = vsub_s16(step2[15], step2[12]); + step1[13] = vsub_s16(step2[14], step2[13]); + step1[14] = vadd_s16(step2[14], step2[13]); + step1[15] = vadd_s16(step2[15], step2[12]); + + // stage 6 + step2[0] = vadd_s16(step1[0], step1[7]); + step2[1] = vadd_s16(step1[1], step1[6]); + step2[2] = vadd_s16(step1[2], step1[5]); + step2[3] = vadd_s16(step1[3], step1[4]); + step2[4] = vsub_s16(step1[3], step1[4]); + step2[5] = vsub_s16(step1[2], step1[5]); + step2[6] = vsub_s16(step1[1], step1[6]); + step2[7] = vsub_s16(step1[0], step1[7]); + idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + out[0] = vadd_s16(step2[0], step2[15]); + out[1] = vadd_s16(step2[1], step2[14]); + out[2] = vadd_s16(step2[2], step2[13]); + out[3] = vadd_s16(step2[3], step2[12]); + out[4] = vadd_s16(step2[4], step2[11]); + out[5] = vadd_s16(step2[5], step2[10]); + out[6] = vadd_s16(step2[6], step2[9]); + out[7] = vadd_s16(step2[7], step2[8]); + out[8] = vsub_s16(step2[7], step2[8]); + out[9] = vsub_s16(step2[6], step2[9]); + out[10] = vsub_s16(step2[5], step2[10]); + out[11] = vsub_s16(step2[4], step2[11]); + out[12] = vsub_s16(step2[3], step2[12]); + out[13] = vsub_s16(step2[2], step2[13]); + out[14] = vsub_s16(step2[1], step2[14]); + out[15] = vsub_s16(step2[0], step2[15]); + + // pass 1: save the result into output + vst1_s16(output, out[0]); + output += 4; + vst1_s16(output, out[1]); + output += 4; + vst1_s16(output, out[2]); + output += 4; + 
vst1_s16(output, out[3]); + output += 4; + vst1_s16(output, out[4]); + output += 4; + vst1_s16(output, out[5]); + output += 4; + vst1_s16(output, out[6]); + output += 4; + vst1_s16(output, out[7]); + output += 4; + vst1_s16(output, out[8]); + output += 4; + vst1_s16(output, out[9]); + output += 4; + vst1_s16(output, out[10]); + output += 4; + vst1_s16(output, out[11]); + output += 4; + vst1_s16(output, out[12]); + output += 4; + vst1_s16(output, out[13]); + output += 4; + vst1_s16(output, out[14]); + output += 4; + vst1_s16(output, out[15]); +} + +void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input, + int16_t *const output, void *const dest, + const int stride, + const int highbd_flag) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); + const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0); + const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0); + const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1); + const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1); + int16x4_t ind[8]; + int16x8_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x8) + ind[0] = vld1_s16(input); + input += 4; + ind[1] = vld1_s16(input); + input += 4; + ind[2] = vld1_s16(input); + input += 4; + ind[3] = vld1_s16(input); + input += 4; + ind[4] = vld1_s16(input); + input += 4; + ind[5] = vld1_s16(input); + input += 4; + ind[6] = vld1_s16(input); + input += 4; + ind[7] = vld1_s16(input); + + // Transpose + transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6], + ind[7], &in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1); + step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1); + step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0); + step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0); + + // stage 3 + step1[0] = step2[0]; + step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3); + step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s16(step2[8], step2[11]); + step1[9] = vaddq_s16(step2[9], step2[10]); + step1[10] = vsubq_s16(step2[9], step2[10]); + step1[11] = vsubq_s16(step2[8], step2[11]); + step1[12] = vsubq_s16(step2[15], step2[12]); + step1[13] = vsubq_s16(step2[14], step2[13]); + 
step1[14] = vaddq_s16(step2[14], step2[13]); + step1[15] = vaddq_s16(step2[15], step2[12]); + + // stage 6 + step2[0] = vaddq_s16(step1[0], step1[7]); + step2[1] = vaddq_s16(step1[1], step1[6]); + step2[2] = vaddq_s16(step1[2], step1[5]); + step2[3] = vaddq_s16(step1[3], step1[4]); + step2[4] = vsubq_s16(step1[3], step1[4]); + step2[5] = vsubq_s16(step1[2], step1[5]); + step2[6] = vsubq_s16(step1[1], step1[6]); + step2[7] = vsubq_s16(step1[0], step1[7]); + idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + idct16x16_add_stage7(step2, out); + + if (output) { + idct16x16_store_pass1(out, output); + } else { + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } + } +} + +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 0); + + // Parallel idct on the lower 8 rows + vpx_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest, + stride, 0); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 0); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride, + 0); +} + +void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 0); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 0); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride, + 0); +} + +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int16_t row_idct_output[4 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 0); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8, + stride, 0); } diff --git a/libs/libvpx/vpx_dsp/arm/idct16x16_neon.c b/libs/libvpx/vpx_dsp/arm/idct16x16_neon.c deleted file mode 100644 index ecc263df28..0000000000 --- a/libs/libvpx/vpx_dsp/arm/idct16x16_neon.c +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "vpx_dsp/vpx_dsp_common.h" - -void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output, - int output_stride); -void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output, - int output_stride); -void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride); - -#if HAVE_NEON_ASM -/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ -extern void vpx_push_neon(int64_t *store); -extern void vpx_pop_neon(int64_t *store); -#endif // HAVE_NEON_ASM - -void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, - int dest_stride) { -#if HAVE_NEON_ASM - int64_t store_reg[8]; -#endif - int16_t pass1_output[16 * 16] = { 0 }; - int16_t row_idct_output[16 * 16] = { 0 }; - -#if HAVE_NEON_ASM - // save d8-d15 register values. - vpx_push_neon(store_reg); -#endif - - /* Parallel idct on the upper 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(input, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, - dest, dest_stride); - - /* Parallel idct on the lower 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8, - pass1_output, 0, dest, dest_stride); - - /* Parallel idct on the left 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, - pass1_output, 1, dest, dest_stride); - - /* Parallel idct on the right 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, - row_idct_output + 8, pass1_output, 1, - dest + 8, dest_stride); - -#if HAVE_NEON_ASM - // restore d8-d15 register values. 
- vpx_pop_neon(store_reg); -#endif - - return; -} - -void vpx_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest, - int dest_stride) { -#if HAVE_NEON_ASM - int64_t store_reg[8]; -#endif - int16_t pass1_output[16 * 16] = { 0 }; - int16_t row_idct_output[16 * 16] = { 0 }; - -#if HAVE_NEON_ASM - // save d8-d15 register values. - vpx_push_neon(store_reg); -#endif - - /* Parallel idct on the upper 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_10_add_neon_pass1(input, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, - dest, dest_stride); - - /* Skip Parallel idct on the lower 8 rows as they are all 0s */ - - /* Parallel idct on the left 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, - pass1_output, 1, dest, dest_stride); - - /* Parallel idct on the right 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, - row_idct_output + 8, pass1_output, 1, - dest + 8, dest_stride); - -#if HAVE_NEON_ASM - // restore d8-d15 register values. - vpx_pop_neon(store_reg); -#endif - - return; -} diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c new file mode 100644 index 0000000000..021211bc99 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c @@ -0,0 +1,678 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void load_8x8_s16(const tran_low_t *input, int16x8_t *const in0, + int16x8_t *const in1, int16x8_t *const in2, + int16x8_t *const in3, int16x8_t *const in4, + int16x8_t *const in5, int16x8_t *const in6, + int16x8_t *const in7) { + *in0 = load_tran_low_to_s16q(input); + input += 32; + *in1 = load_tran_low_to_s16q(input); + input += 32; + *in2 = load_tran_low_to_s16q(input); + input += 32; + *in3 = load_tran_low_to_s16q(input); + input += 32; + *in4 = load_tran_low_to_s16q(input); + input += 32; + *in5 = load_tran_low_to_s16q(input); + input += 32; + *in6 = load_tran_low_to_s16q(input); + input += 32; + *in7 = load_tran_low_to_s16q(input); +} + +static INLINE void load_4x8_s16(const tran_low_t *input, int16x4_t *const in0, + int16x4_t *const in1, int16x4_t *const in2, + int16x4_t *const in3, int16x4_t *const in4, + int16x4_t *const in5, int16x4_t *const in6, + int16x4_t *const in7) { + *in0 = load_tran_low_to_s16d(input); + input += 32; + *in1 = load_tran_low_to_s16d(input); + input += 32; + *in2 = load_tran_low_to_s16d(input); + input += 32; + *in3 = load_tran_low_to_s16d(input); + input += 32; + *in4 = load_tran_low_to_s16d(input); + input += 32; + *in5 = load_tran_low_to_s16d(input); + input += 32; + *in6 = load_tran_low_to_s16d(input); + input += 32; + *in7 = load_tran_low_to_s16d(input); +} + +// Only for the first pass of the _135_ variant. Since it only uses values from +// the top left 16x16 it can safely assume all the remaining values are 0 and +// skip an awful lot of calculations. In fact, only the first 12 columns make +// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are +// used so it skips any calls to input[12|13|14|15] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 12x8 to allow using SIMD.
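[Editorial illustration, not part of the patch] A minimal scalar sketch of the specialization described in the comment above, assuming the conventional two-multiply stage-1 butterfly and the dct_const_round_shift() helper declared in vpx_dsp/inv_txfm.h; the variable names s16, s31 and in[] here are hypothetical:

    /* Full butterfly: two multiplies and one rounded shift per output. */
    s16 = dct_const_round_shift(in[1] * cospi_31_64 - in[31] * cospi_1_64);
    s31 = dct_const_round_shift(in[1] * cospi_1_64 + in[31] * cospi_31_64);

    /* With eob <= 135 every coefficient outside the top-left 16x16 block is
     * zero (in[31] == 0 here), so each output collapses to a single multiply
     * and round; multiply_shift_and_narrow_s16() below is the NEON
     * counterpart of this reduced form. */
    s16 = dct_const_round_shift(in[1] * cospi_31_64);
    s31 = dct_const_round_shift(in[1] * cospi_1_64);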
+ +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// 0 0 2 5 10 17 25 38 47 62 83 101 121 +// 1 1 4 8 15 22 30 45 58 74 92 112 133 +// 2 3 7 12 18 28 36 52 64 82 102 118 +// 3 6 11 16 23 31 43 60 73 90 109 126 +// 4 9 14 19 29 37 50 65 78 98 116 134 +// 5 13 20 26 35 44 54 72 85 105 123 +// 6 21 27 33 42 53 63 80 94 113 132 +// 7 24 32 39 48 57 71 88 104 120 +// 8 34 40 46 56 68 81 96 111 130 +// 9 41 49 55 67 77 91 107 124 +// 10 51 59 66 76 89 99 119 131 +// 11 61 69 75 87 100 114 129 +// 12 70 79 86 97 108 122 +// 13 84 93 103 110 125 +// 14 98 106 115 127 +// 15 117 128 +void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output) { + int16x4_t tmp[8]; + int16x8_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32]; + + load_8x8_s16(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + load_4x8_s16(input + 8, &tmp[0], &tmp[1], &tmp[2], &tmp[3], &tmp[4], &tmp[5], + &tmp[6], &tmp[7]); + transpose_s16_4x8(tmp[0], tmp[1], tmp[2], tmp[3], tmp[4], tmp[5], tmp[6], + tmp[7], &in[8], &in[9], &in[10], &in[11]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64); + + s1[18] = multiply_shift_and_narrow_s16(in[9], cospi_23_64); + s1[29] = multiply_shift_and_narrow_s16(in[9], cospi_9_64); + + s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64); + + s1[21] = multiply_shift_and_narrow_s16(in[11], -cospi_21_64); + s1[26] = multiply_shift_and_narrow_s16(in[11], cospi_11_64); + + s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64); + + s2[10] = multiply_shift_and_narrow_s16(in[10], cospi_22_64); + s2[13] = multiply_shift_and_narrow_s16(in[10], cospi_10_64); + + s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64); + + s2[18] = vsubq_s16(s1[19], s1[18]); + s2[19] = vaddq_s16(s1[18], s1[19]); + s2[20] = vaddq_s16(s1[20], s1[21]); + s2[21] = vsubq_s16(s1[20], s1[21]); + s2[26] = vsubq_s16(s1[27], s1[26]); + s2[27] = vaddq_s16(s1[26], s1[27]); + s2[28] = vaddq_s16(s1[28], s1[29]); + s2[29] = vsubq_s16(s1[28], s1[29]); + + // stage 3 + s3[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64); + s3[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64); + + s3[10] = vsubq_s16(s2[11], s2[10]); + s3[11] = vaddq_s16(s2[10], s2[11]); + s3[12] = vaddq_s16(s2[12], s2[13]); + s3[13] = vsubq_s16(s2[12], s2[13]); + + s3[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31], + cospi_28_64); + s3[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31], + cospi_4_64); + + s3[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_28_64, + s2[29], -cospi_4_64); + s3[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_4_64, s2[29], + cospi_28_64); + + s3[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_20_64, + s2[26], cospi_12_64); + s3[26] = 
multiply_accumulate_shift_and_narrow_s16(s2[21], cospi_12_64, s2[26], + cospi_20_64); + + s3[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s3[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s4[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64); + s4[2] = multiply_shift_and_narrow_s16(in[8], cospi_24_64); + s4[3] = multiply_shift_and_narrow_s16(in[8], cospi_8_64); + + s4[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15], + cospi_24_64); + s4[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15], + cospi_8_64); + + s4[10] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_24_64, + s3[13], -cospi_8_64); + s4[13] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_8_64, s3[13], + cospi_24_64); + + s4[16] = vaddq_s16(s1[16], s2[19]); + s4[17] = vaddq_s16(s3[17], s3[18]); + s4[18] = vsubq_s16(s3[17], s3[18]); + s4[19] = vsubq_s16(s1[16], s2[19]); + s4[20] = vsubq_s16(s1[23], s2[20]); + s4[21] = vsubq_s16(s3[22], s3[21]); + s4[22] = vaddq_s16(s3[21], s3[22]); + s4[23] = vaddq_s16(s2[20], s1[23]); + s4[24] = vaddq_s16(s1[24], s2[27]); + s4[25] = vaddq_s16(s3[25], s3[26]); + s4[26] = vsubq_s16(s3[25], s3[26]); + s4[27] = vsubq_s16(s1[24], s2[27]); + s4[28] = vsubq_s16(s1[31], s2[28]); + s4[29] = vsubq_s16(s3[30], s3[29]); + s4[30] = vaddq_s16(s3[29], s3[30]); + s4[31] = vaddq_s16(s2[28], s1[31]); + + // stage 5 + s5[0] = vaddq_s16(s4[0], s4[3]); + s5[1] = vaddq_s16(s4[0], s4[2]); + s5[2] = vsubq_s16(s4[0], s4[2]); + s5[3] = vsubq_s16(s4[0], s4[3]); + + s5[5] = sub_multiply_shift_and_narrow_s16(s3[7], s3[4], cospi_16_64); + s5[6] = add_multiply_shift_and_narrow_s16(s3[4], s3[7], cospi_16_64); + + s5[8] = vaddq_s16(s2[8], s3[11]); + s5[9] = vaddq_s16(s4[9], s4[10]); + s5[10] = vsubq_s16(s4[9], s4[10]); + s5[11] = vsubq_s16(s2[8], s3[11]); + s5[12] = vsubq_s16(s2[15], s3[12]); + s5[13] = vsubq_s16(s4[14], s4[13]); + s5[14] = vaddq_s16(s4[13], s4[14]); + s5[15] = vaddq_s16(s2[15], s3[12]); + + s5[18] = multiply_accumulate_shift_and_narrow_s16(s4[18], -cospi_8_64, s4[29], + cospi_24_64); + s5[29] = multiply_accumulate_shift_and_narrow_s16(s4[18], cospi_24_64, s4[29], + cospi_8_64); + + s5[19] = multiply_accumulate_shift_and_narrow_s16(s4[19], -cospi_8_64, s4[28], + cospi_24_64); + s5[28] = multiply_accumulate_shift_and_narrow_s16(s4[19], cospi_24_64, s4[28], + cospi_8_64); + + s5[20] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_24_64, + s4[27], -cospi_8_64); + s5[27] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_8_64, s4[27], + cospi_24_64); + + s5[21] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_24_64, + s4[26], -cospi_8_64); + s5[26] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_8_64, s4[26], + cospi_24_64); + + // stage 6 + s6[0] = vaddq_s16(s5[0], s3[7]); + s6[1] = vaddq_s16(s5[1], s5[6]); + s6[2] = vaddq_s16(s5[2], s5[5]); + s6[3] = vaddq_s16(s5[3], s3[4]); + s6[4] = vsubq_s16(s5[3], s3[4]); + s6[5] = vsubq_s16(s5[2], s5[5]); + s6[6] = vsubq_s16(s5[1], s5[6]); + s6[7] = vsubq_s16(s5[0], s3[7]); + + s6[10] = sub_multiply_shift_and_narrow_s16(s5[13], s5[10], cospi_16_64); + s6[13] = add_multiply_shift_and_narrow_s16(s5[10], s5[13], cospi_16_64); + + s6[11] = sub_multiply_shift_and_narrow_s16(s5[12], s5[11], cospi_16_64); + s6[12] = add_multiply_shift_and_narrow_s16(s5[11], s5[12], cospi_16_64); + + s6[16] = vaddq_s16(s4[16], s4[23]); + s6[17] = vaddq_s16(s4[17], s4[22]); + s6[18] = 
vaddq_s16(s5[18], s5[21]); + s6[19] = vaddq_s16(s5[19], s5[20]); + s6[20] = vsubq_s16(s5[19], s5[20]); + s6[21] = vsubq_s16(s5[18], s5[21]); + s6[22] = vsubq_s16(s4[17], s4[22]); + s6[23] = vsubq_s16(s4[16], s4[23]); + + s6[24] = vsubq_s16(s4[31], s4[24]); + s6[25] = vsubq_s16(s4[30], s4[25]); + s6[26] = vsubq_s16(s5[29], s5[26]); + s6[27] = vsubq_s16(s5[28], s5[27]); + s6[28] = vaddq_s16(s5[27], s5[28]); + s6[29] = vaddq_s16(s5[26], s5[29]); + s6[30] = vaddq_s16(s4[25], s4[30]); + s6[31] = vaddq_s16(s4[24], s4[31]); + + // stage 7 + s7[0] = vaddq_s16(s6[0], s5[15]); + s7[1] = vaddq_s16(s6[1], s5[14]); + s7[2] = vaddq_s16(s6[2], s6[13]); + s7[3] = vaddq_s16(s6[3], s6[12]); + s7[4] = vaddq_s16(s6[4], s6[11]); + s7[5] = vaddq_s16(s6[5], s6[10]); + s7[6] = vaddq_s16(s6[6], s5[9]); + s7[7] = vaddq_s16(s6[7], s5[8]); + s7[8] = vsubq_s16(s6[7], s5[8]); + s7[9] = vsubq_s16(s6[6], s5[9]); + s7[10] = vsubq_s16(s6[5], s6[10]); + s7[11] = vsubq_s16(s6[4], s6[11]); + s7[12] = vsubq_s16(s6[3], s6[12]); + s7[13] = vsubq_s16(s6[2], s6[13]); + s7[14] = vsubq_s16(s6[1], s5[14]); + s7[15] = vsubq_s16(s6[0], s5[15]); + + s7[20] = sub_multiply_shift_and_narrow_s16(s6[27], s6[20], cospi_16_64); + s7[27] = add_multiply_shift_and_narrow_s16(s6[20], s6[27], cospi_16_64); + + s7[21] = sub_multiply_shift_and_narrow_s16(s6[26], s6[21], cospi_16_64); + s7[26] = add_multiply_shift_and_narrow_s16(s6[21], s6[26], cospi_16_64); + + s7[22] = sub_multiply_shift_and_narrow_s16(s6[25], s6[22], cospi_16_64); + s7[25] = add_multiply_shift_and_narrow_s16(s6[22], s6[25], cospi_16_64); + + s7[23] = sub_multiply_shift_and_narrow_s16(s6[24], s6[23], cospi_16_64); + s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64); + + // final stage + vst1q_s16(output, vaddq_s16(s7[0], s6[31])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[1], s6[30])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[2], s6[29])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[3], s6[28])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[4], s7[27])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[5], s7[26])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[6], s7[25])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[7], s7[24])); + output += 16; + + vst1q_s16(output, vaddq_s16(s7[8], s7[23])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[9], s7[22])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[10], s7[21])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[11], s7[20])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[12], s6[19])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[13], s6[18])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[14], s6[17])); + output += 16; + vst1q_s16(output, vaddq_s16(s7[15], s6[16])); + output += 16; + + vst1q_s16(output, vsubq_s16(s7[15], s6[16])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[14], s6[17])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[13], s6[18])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[12], s6[19])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[11], s7[20])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[10], s7[21])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[9], s7[22])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[8], s7[23])); + output += 16; + + vst1q_s16(output, vsubq_s16(s7[7], s7[24])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[6], s7[25])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[5], s7[26])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[4], s7[27])); + output += 16; + vst1q_s16(output, 
vsubq_s16(s7[3], s6[28])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[2], s6[29])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[1], s6[30])); + output += 16; + vst1q_s16(output, vsubq_s16(s7[0], s6[31])); +} + +void vpx_idct32_16_neon(const int16_t *const input, void *const output, + const int stride, const int highbd_flag) { + int16x8_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32], + out[32]; + + load_and_transpose_s16_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4], + &in[5], &in[6], &in[7]); + + load_and_transpose_s16_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11], + &in[12], &in[13], &in[14], &in[15]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64); + + s1[17] = multiply_shift_and_narrow_s16(in[15], -cospi_17_64); + s1[30] = multiply_shift_and_narrow_s16(in[15], cospi_15_64); + + s1[18] = multiply_shift_and_narrow_s16(in[9], cospi_23_64); + s1[29] = multiply_shift_and_narrow_s16(in[9], cospi_9_64); + + s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64); + + s1[21] = multiply_shift_and_narrow_s16(in[11], -cospi_21_64); + s1[26] = multiply_shift_and_narrow_s16(in[11], cospi_11_64); + + s1[22] = multiply_shift_and_narrow_s16(in[13], cospi_19_64); + s1[25] = multiply_shift_and_narrow_s16(in[13], cospi_13_64); + + s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64); + + s2[9] = multiply_shift_and_narrow_s16(in[14], -cospi_18_64); + s2[14] = multiply_shift_and_narrow_s16(in[14], cospi_14_64); + + s2[10] = multiply_shift_and_narrow_s16(in[10], cospi_22_64); + s2[13] = multiply_shift_and_narrow_s16(in[10], cospi_10_64); + + s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64); + + s2[16] = vaddq_s16(s1[16], s1[17]); + s2[17] = vsubq_s16(s1[16], s1[17]); + s2[18] = vsubq_s16(s1[19], s1[18]); + s2[19] = vaddq_s16(s1[18], s1[19]); + s2[20] = vaddq_s16(s1[20], s1[21]); + s2[21] = vsubq_s16(s1[20], s1[21]); + s2[22] = vsubq_s16(s1[23], s1[22]); + s2[23] = vaddq_s16(s1[22], s1[23]); + s2[24] = vaddq_s16(s1[24], s1[25]); + s2[25] = vsubq_s16(s1[24], s1[25]); + s2[26] = vsubq_s16(s1[27], s1[26]); + s2[27] = vaddq_s16(s1[26], s1[27]); + s2[28] = vaddq_s16(s1[28], s1[29]); + s2[29] = vsubq_s16(s1[28], s1[29]); + s2[30] = vsubq_s16(s1[31], s1[30]); + s2[31] = vaddq_s16(s1[30], s1[31]); + + // stage 3 + s3[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64); + s3[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64); + + s3[5] = multiply_shift_and_narrow_s16(in[12], -cospi_20_64); + s3[6] = multiply_shift_and_narrow_s16(in[12], cospi_12_64); + + s3[8] = vaddq_s16(s2[8], s2[9]); + s3[9] = vsubq_s16(s2[8], s2[9]); + s3[10] = vsubq_s16(s2[11], s2[10]); + s3[11] = vaddq_s16(s2[10], s2[11]); + s3[12] = vaddq_s16(s2[12], s2[13]); + s3[13] = vsubq_s16(s2[12], s2[13]); + s3[14] = vsubq_s16(s2[15], s2[14]); + s3[15] = vaddq_s16(s2[14], s2[15]); + + s3[17] = multiply_accumulate_shift_and_narrow_s16(s2[17], -cospi_4_64, s2[30], + cospi_28_64); + s3[30] = multiply_accumulate_shift_and_narrow_s16(s2[17], cospi_28_64, s2[30], + 
cospi_4_64); + + s3[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_28_64, + s2[29], -cospi_4_64); + s3[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_4_64, s2[29], + cospi_28_64); + + s3[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_20_64, + s2[26], cospi_12_64); + s3[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], cospi_12_64, s2[26], + cospi_20_64); + + s3[22] = multiply_accumulate_shift_and_narrow_s16(s2[22], -cospi_12_64, + s2[25], -cospi_20_64); + s3[25] = multiply_accumulate_shift_and_narrow_s16(s2[22], -cospi_20_64, + s2[25], cospi_12_64); + + // stage 4 + s4[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64); + s4[2] = multiply_shift_and_narrow_s16(in[8], cospi_24_64); + s4[3] = multiply_shift_and_narrow_s16(in[8], cospi_8_64); + + s4[4] = vaddq_s16(s3[4], s3[5]); + s4[5] = vsubq_s16(s3[4], s3[5]); + s4[6] = vsubq_s16(s3[7], s3[6]); + s4[7] = vaddq_s16(s3[6], s3[7]); + + s4[9] = multiply_accumulate_shift_and_narrow_s16(s3[9], -cospi_8_64, s3[14], + cospi_24_64); + s4[14] = multiply_accumulate_shift_and_narrow_s16(s3[9], cospi_24_64, s3[14], + cospi_8_64); + + s4[10] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_24_64, + s3[13], -cospi_8_64); + s4[13] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_8_64, s3[13], + cospi_24_64); + + s4[16] = vaddq_s16(s2[16], s2[19]); + s4[17] = vaddq_s16(s3[17], s3[18]); + s4[18] = vsubq_s16(s3[17], s3[18]); + s4[19] = vsubq_s16(s2[16], s2[19]); + s4[20] = vsubq_s16(s2[23], s2[20]); + s4[21] = vsubq_s16(s3[22], s3[21]); + s4[22] = vaddq_s16(s3[21], s3[22]); + s4[23] = vaddq_s16(s2[20], s2[23]); + s4[24] = vaddq_s16(s2[24], s2[27]); + s4[25] = vaddq_s16(s3[25], s3[26]); + s4[26] = vsubq_s16(s3[25], s3[26]); + s4[27] = vsubq_s16(s2[24], s2[27]); + s4[28] = vsubq_s16(s2[31], s2[28]); + s4[29] = vsubq_s16(s3[30], s3[29]); + s4[30] = vaddq_s16(s3[29], s3[30]); + s4[31] = vaddq_s16(s2[28], s2[31]); + + // stage 5 + s5[0] = vaddq_s16(s4[0], s4[3]); + s5[1] = vaddq_s16(s4[0], s4[2]); + s5[2] = vsubq_s16(s4[0], s4[2]); + s5[3] = vsubq_s16(s4[0], s4[3]); + + s5[5] = sub_multiply_shift_and_narrow_s16(s4[6], s4[5], cospi_16_64); + s5[6] = add_multiply_shift_and_narrow_s16(s4[5], s4[6], cospi_16_64); + + s5[8] = vaddq_s16(s3[8], s3[11]); + s5[9] = vaddq_s16(s4[9], s4[10]); + s5[10] = vsubq_s16(s4[9], s4[10]); + s5[11] = vsubq_s16(s3[8], s3[11]); + s5[12] = vsubq_s16(s3[15], s3[12]); + s5[13] = vsubq_s16(s4[14], s4[13]); + s5[14] = vaddq_s16(s4[13], s4[14]); + s5[15] = vaddq_s16(s3[15], s3[12]); + + s5[18] = multiply_accumulate_shift_and_narrow_s16(s4[18], -cospi_8_64, s4[29], + cospi_24_64); + s5[29] = multiply_accumulate_shift_and_narrow_s16(s4[18], cospi_24_64, s4[29], + cospi_8_64); + + s5[19] = multiply_accumulate_shift_and_narrow_s16(s4[19], -cospi_8_64, s4[28], + cospi_24_64); + s5[28] = multiply_accumulate_shift_and_narrow_s16(s4[19], cospi_24_64, s4[28], + cospi_8_64); + + s5[20] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_24_64, + s4[27], -cospi_8_64); + s5[27] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_8_64, s4[27], + cospi_24_64); + + s5[21] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_24_64, + s4[26], -cospi_8_64); + s5[26] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_8_64, s4[26], + cospi_24_64); + + // stage 6 + s6[0] = vaddq_s16(s5[0], s4[7]); + s6[1] = vaddq_s16(s5[1], s5[6]); + s6[2] = vaddq_s16(s5[2], s5[5]); + s6[3] = vaddq_s16(s5[3], s4[4]); + s6[4] = vsubq_s16(s5[3], s4[4]); + s6[5] = vsubq_s16(s5[2], s5[5]); 
+ s6[6] = vsubq_s16(s5[1], s5[6]); + s6[7] = vsubq_s16(s5[0], s4[7]); + + s6[10] = sub_multiply_shift_and_narrow_s16(s5[13], s5[10], cospi_16_64); + s6[13] = add_multiply_shift_and_narrow_s16(s5[10], s5[13], cospi_16_64); + + s6[11] = sub_multiply_shift_and_narrow_s16(s5[12], s5[11], cospi_16_64); + s6[12] = add_multiply_shift_and_narrow_s16(s5[11], s5[12], cospi_16_64); + + s6[16] = vaddq_s16(s4[16], s4[23]); + s6[17] = vaddq_s16(s4[17], s4[22]); + s6[18] = vaddq_s16(s5[18], s5[21]); + s6[19] = vaddq_s16(s5[19], s5[20]); + s6[20] = vsubq_s16(s5[19], s5[20]); + s6[21] = vsubq_s16(s5[18], s5[21]); + s6[22] = vsubq_s16(s4[17], s4[22]); + s6[23] = vsubq_s16(s4[16], s4[23]); + s6[24] = vsubq_s16(s4[31], s4[24]); + s6[25] = vsubq_s16(s4[30], s4[25]); + s6[26] = vsubq_s16(s5[29], s5[26]); + s6[27] = vsubq_s16(s5[28], s5[27]); + s6[28] = vaddq_s16(s5[27], s5[28]); + s6[29] = vaddq_s16(s5[26], s5[29]); + s6[30] = vaddq_s16(s4[25], s4[30]); + s6[31] = vaddq_s16(s4[24], s4[31]); + + // stage 7 + s7[0] = vaddq_s16(s6[0], s5[15]); + s7[1] = vaddq_s16(s6[1], s5[14]); + s7[2] = vaddq_s16(s6[2], s6[13]); + s7[3] = vaddq_s16(s6[3], s6[12]); + s7[4] = vaddq_s16(s6[4], s6[11]); + s7[5] = vaddq_s16(s6[5], s6[10]); + s7[6] = vaddq_s16(s6[6], s5[9]); + s7[7] = vaddq_s16(s6[7], s5[8]); + s7[8] = vsubq_s16(s6[7], s5[8]); + s7[9] = vsubq_s16(s6[6], s5[9]); + s7[10] = vsubq_s16(s6[5], s6[10]); + s7[11] = vsubq_s16(s6[4], s6[11]); + s7[12] = vsubq_s16(s6[3], s6[12]); + s7[13] = vsubq_s16(s6[2], s6[13]); + s7[14] = vsubq_s16(s6[1], s5[14]); + s7[15] = vsubq_s16(s6[0], s5[15]); + + s7[20] = sub_multiply_shift_and_narrow_s16(s6[27], s6[20], cospi_16_64); + s7[27] = add_multiply_shift_and_narrow_s16(s6[20], s6[27], cospi_16_64); + + s7[21] = sub_multiply_shift_and_narrow_s16(s6[26], s6[21], cospi_16_64); + s7[26] = add_multiply_shift_and_narrow_s16(s6[21], s6[26], cospi_16_64); + + s7[22] = sub_multiply_shift_and_narrow_s16(s6[25], s6[22], cospi_16_64); + s7[25] = add_multiply_shift_and_narrow_s16(s6[22], s6[25], cospi_16_64); + + s7[23] = sub_multiply_shift_and_narrow_s16(s6[24], s6[23], cospi_16_64); + s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64); + + // final stage + out[0] = final_add(s7[0], s6[31]); + out[1] = final_add(s7[1], s6[30]); + out[2] = final_add(s7[2], s6[29]); + out[3] = final_add(s7[3], s6[28]); + out[4] = final_add(s7[4], s7[27]); + out[5] = final_add(s7[5], s7[26]); + out[6] = final_add(s7[6], s7[25]); + out[7] = final_add(s7[7], s7[24]); + out[8] = final_add(s7[8], s7[23]); + out[9] = final_add(s7[9], s7[22]); + out[10] = final_add(s7[10], s7[21]); + out[11] = final_add(s7[11], s7[20]); + out[12] = final_add(s7[12], s6[19]); + out[13] = final_add(s7[13], s6[18]); + out[14] = final_add(s7[14], s6[17]); + out[15] = final_add(s7[15], s6[16]); + out[16] = final_sub(s7[15], s6[16]); + out[17] = final_sub(s7[14], s6[17]); + out[18] = final_sub(s7[13], s6[18]); + out[19] = final_sub(s7[12], s6[19]); + out[20] = final_sub(s7[11], s7[20]); + out[21] = final_sub(s7[10], s7[21]); + out[22] = final_sub(s7[9], s7[22]); + out[23] = final_sub(s7[8], s7[23]); + out[24] = final_sub(s7[7], s7[24]); + out[25] = final_sub(s7[6], s7[25]); + out[26] = final_sub(s7[5], s7[26]); + out[27] = final_sub(s7[4], s7[27]); + out[28] = final_sub(s7[3], s6[28]); + out[29] = final_sub(s7[2], s6[29]); + out[30] = final_sub(s7[1], s6[30]); + out[31] = final_sub(s7[0], s6[31]); + + if (highbd_flag) { + highbd_add_and_store_bd8(out, output, stride); + } else { + uint8_t *const outputT = (uint8_t *)output; + 
add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6], + out[7], outputT, stride); + add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13], + out[14], out[15], outputT + (8 * stride), stride); + add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21], + out[22], out[23], outputT + (16 * stride), stride); + add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29], + out[30], out[31], outputT + (24 * stride), stride); + } +} + +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int i; + int16_t temp[32 * 16]; + int16_t *t = temp; + + vpx_idct32_12_neon(input, temp); + vpx_idct32_12_neon(input + 32 * 8, temp + 8); + + for (i = 0; i < 32; i += 8) { + vpx_idct32_16_neon(t, dest, stride, 0); + t += (16 * 8); + dest += 8; + } +} diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm deleted file mode 100644 index 96d276b4d1..0000000000 --- a/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm +++ /dev/null @@ -1,144 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - EXPORT |vpx_idct32x32_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ;TODO(hkuang): put the following macros in a seperate - ;file so other idct function could also use them. - MACRO - LD_16x8 $src, $stride - vld1.8 {q8}, [$src], $stride - vld1.8 {q9}, [$src], $stride - vld1.8 {q10}, [$src], $stride - vld1.8 {q11}, [$src], $stride - vld1.8 {q12}, [$src], $stride - vld1.8 {q13}, [$src], $stride - vld1.8 {q14}, [$src], $stride - vld1.8 {q15}, [$src], $stride - MEND - - MACRO - ADD_DIFF_16x8 $diff - vqadd.u8 q8, q8, $diff - vqadd.u8 q9, q9, $diff - vqadd.u8 q10, q10, $diff - vqadd.u8 q11, q11, $diff - vqadd.u8 q12, q12, $diff - vqadd.u8 q13, q13, $diff - vqadd.u8 q14, q14, $diff - vqadd.u8 q15, q15, $diff - MEND - - MACRO - SUB_DIFF_16x8 $diff - vqsub.u8 q8, q8, $diff - vqsub.u8 q9, q9, $diff - vqsub.u8 q10, q10, $diff - vqsub.u8 q11, q11, $diff - vqsub.u8 q12, q12, $diff - vqsub.u8 q13, q13, $diff - vqsub.u8 q14, q14, $diff - vqsub.u8 q15, q15, $diff - MEND - - MACRO - ST_16x8 $dst, $stride - vst1.8 {q8}, [$dst], $stride - vst1.8 {q9}, [$dst], $stride - vst1.8 {q10},[$dst], $stride - vst1.8 {q11},[$dst], $stride - vst1.8 {q12},[$dst], $stride - vst1.8 {q13},[$dst], $stride - vst1.8 {q14},[$dst], $stride - vst1.8 {q15},[$dst], $stride - MEND - -;void vpx_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride - -|vpx_idct32x32_1_add_neon| PROC - push {lr} - pld [r1] - add r3, r1, #16 ; r3 dest + 16 for second loop - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 6) - add r0, r0, #32 ; + (1 
<<((6) - 1)) - asrs r0, r0, #6 ; >> 6 - bge diff_positive_32_32 - -diff_negative_32_32 - neg r0, r0 - usat r0, #8, r0 - vdup.u8 q0, r0 - mov r0, #4 - -diff_negative_32_32_loop - sub r0, #1 - LD_16x8 r1, r2 - SUB_DIFF_16x8 q0 - ST_16x8 r12, r2 - - LD_16x8 r1, r2 - SUB_DIFF_16x8 q0 - ST_16x8 r12, r2 - cmp r0, #2 - moveq r1, r3 - moveq r12, r3 - cmp r0, #0 - bne diff_negative_32_32_loop - pop {pc} - -diff_positive_32_32 - usat r0, #8, r0 - vdup.u8 q0, r0 - mov r0, #4 - -diff_positive_32_32_loop - sub r0, #1 - LD_16x8 r1, r2 - ADD_DIFF_16x8 q0 - ST_16x8 r12, r2 - - LD_16x8 r1, r2 - ADD_DIFF_16x8 q0 - ST_16x8 r12, r2 - cmp r0, #2 - moveq r1, r3 - moveq r12, r3 - cmp r0, #0 - bne diff_positive_32_32_loop - pop {pc} - - ENDP ; |vpx_idct32x32_1_add_neon| - END diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c index dab7d098e8..8920b93363 100644 --- a/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c @@ -10,131 +10,49 @@ #include -#include "./vpx_config.h" - +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vld1q_u8(d); - d += d_stride; - *q9u8 = vld1q_u8(d); - d += d_stride; - *q10u8 = vld1q_u8(d); - d += d_stride; - *q11u8 = vld1q_u8(d); - d += d_stride; - *q12u8 = vld1q_u8(d); - d += d_stride; - *q13u8 = vld1q_u8(d); - d += d_stride; - *q14u8 = vld1q_u8(d); - d += d_stride; - *q15u8 = vld1q_u8(d); - return; +static INLINE void idct32x32_1_add_pos_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a0 = vld1q_u8(*dest); + const uint8x16_t a1 = vld1q_u8(*dest + 16); + const uint8x16_t b0 = vqaddq_u8(a0, res); + const uint8x16_t b1 = vqaddq_u8(a1, res); + vst1q_u8(*dest, b0); + vst1q_u8(*dest + 16, b1); + *dest += stride; } -static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqaddq_u8(*q8u8, qdiffu8); - *q9u8 = vqaddq_u8(*q9u8, qdiffu8); - *q10u8 = vqaddq_u8(*q10u8, qdiffu8); - *q11u8 = vqaddq_u8(*q11u8, qdiffu8); - *q12u8 = vqaddq_u8(*q12u8, qdiffu8); - *q13u8 = vqaddq_u8(*q13u8, qdiffu8); - *q14u8 = vqaddq_u8(*q14u8, qdiffu8); - *q15u8 = vqaddq_u8(*q15u8, qdiffu8); - return; +static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a0 = vld1q_u8(*dest); + const uint8x16_t a1 = vld1q_u8(*dest + 16); + const uint8x16_t b0 = vqsubq_u8(a0, res); + const uint8x16_t b1 = vqsubq_u8(a1, res); + vst1q_u8(*dest, b0); + vst1q_u8(*dest + 16, b1); + *dest += stride; } -static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqsubq_u8(*q8u8, qdiffu8); - *q9u8 = vqsubq_u8(*q9u8, qdiffu8); - *q10u8 = vqsubq_u8(*q10u8, qdiffu8); - *q11u8 = vqsubq_u8(*q11u8, qdiffu8); - *q12u8 = vqsubq_u8(*q12u8, qdiffu8); - *q13u8 = vqsubq_u8(*q13u8, qdiffu8); - *q14u8 = vqsubq_u8(*q14u8, qdiffu8); - *q15u8 = vqsubq_u8(*q15u8, qdiffu8); - return; -} +void vpx_idct32x32_1_add_neon(const tran_low_t 
*input, uint8_t *dest, + int stride) { + int i; + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); -static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - vst1q_u8(d, *q8u8); - d += d_stride; - vst1q_u8(d, *q9u8); - d += d_stride; - vst1q_u8(d, *q10u8); - d += d_stride; - vst1q_u8(d, *q11u8); - d += d_stride; - vst1q_u8(d, *q12u8); - d += d_stride; - vst1q_u8(d, *q13u8); - d += d_stride; - vst1q_u8(d, *q14u8); - d += d_stride; - vst1q_u8(d, *q15u8); - return; -} - -void vpx_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; - int i, j, dest_stride8; - uint8_t *d; - int16_t a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - dest_stride8 = dest_stride * 8; - if (a1 >= 0) { // diff_positive_32_32 - a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - d += dest_stride8; - } + if (a1 >= 0) { + const uint8x16_t dc = create_dcq(a1); + for (i = 0; i < 32; i++) { + idct32x32_1_add_pos_kernel(&dest, stride, dc); } - } else { // diff_negative_32_32 - a1 = -a1; - a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - d += dest_stride8; - } + } else { + const uint8x16_t dc = create_dcq(-a1); + for (i = 0; i < 32; i++) { + idct32x32_1_add_neg_kernel(&dest, stride, dc); } } - return; } diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c new file mode 100644 index 0000000000..f3c336fa31 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c @@ -0,0 +1,517 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +// Only for the first pass of the _34_ variant. 
Since it only uses values from +// the top left 8x8 it can safely assume all the remaining values are 0 and skip +// an awful lot of calculations. In fact, only the first 6 columns make the cut. +// None of the elements in the 7th or 8th column are used so it skips any calls +// to input[67] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 8x8 to allow using SIMD. + +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 +// 0 0 2 5 10 17 25 +// 1 1 4 8 15 22 30 +// 2 3 7 12 18 28 +// 3 6 11 16 23 31 +// 4 9 14 19 29 +// 5 13 20 26 +// 6 21 27 33 +// 7 24 32 +void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output) { + int16x8_t in[8], s1[32], s2[32], s3[32]; + + in[0] = load_tran_low_to_s16q(input); + input += 32; + in[1] = load_tran_low_to_s16q(input); + input += 32; + in[2] = load_tran_low_to_s16q(input); + input += 32; + in[3] = load_tran_low_to_s16q(input); + input += 32; + in[4] = load_tran_low_to_s16q(input); + input += 32; + in[5] = load_tran_low_to_s16q(input); + input += 32; + in[6] = load_tran_low_to_s16q(input); + input += 32; + in[7] = load_tran_low_to_s16q(input); + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + // stage 1 + // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0) + s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64); + // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0) + s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64); + + s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64); + + s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64); + + // stage 3 + s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64); + s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64); + + s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31], + cospi_28_64); + s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31], + cospi_4_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64, + s1[27], cospi_12_64); + s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27], + cospi_20_64); + + s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64); + + s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15], + cospi_24_64); + s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15], + cospi_8_64); + + s2[20] = vsubq_s16(s1[23], s1[20]); + s2[21] = vsubq_s16(s1[22], s1[21]); + s2[22] = vaddq_s16(s1[21], s1[22]); + s2[23] = vaddq_s16(s1[20], s1[23]); + s2[24] = vaddq_s16(s1[24], s1[27]); + s2[25] = vaddq_s16(s1[25], s1[26]); + s2[26] = vsubq_s16(s1[25], s1[26]); + s2[27] = vsubq_s16(s1[24], s1[27]); + + // stage 5 + s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64); + s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64); + + s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[17], -cospi_8_64, s1[30], + cospi_24_64); + s1[29] = 
multiply_accumulate_shift_and_narrow_s16(s1[17], cospi_24_64, s1[30], + cospi_8_64); + + s1[19] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_8_64, s1[31], + cospi_24_64); + s1[28] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_24_64, s1[31], + cospi_8_64); + + s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64, + s2[27], -cospi_8_64); + s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27], + cospi_24_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64, + s2[26], -cospi_8_64); + s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26], + cospi_24_64); + + // stage 6 + s2[0] = vaddq_s16(s1[0], s1[7]); + s2[1] = vaddq_s16(s1[0], s1[6]); + s2[2] = vaddq_s16(s1[0], s1[5]); + s2[3] = vaddq_s16(s1[0], s1[4]); + s2[4] = vsubq_s16(s1[0], s1[4]); + s2[5] = vsubq_s16(s1[0], s1[5]); + s2[6] = vsubq_s16(s1[0], s1[6]); + s2[7] = vsubq_s16(s1[0], s1[7]); + + s2[10] = sub_multiply_shift_and_narrow_s16(s2[14], s2[9], cospi_16_64); + s2[13] = add_multiply_shift_and_narrow_s16(s2[9], s2[14], cospi_16_64); + + s2[11] = sub_multiply_shift_and_narrow_s16(s2[15], s2[8], cospi_16_64); + s2[12] = add_multiply_shift_and_narrow_s16(s2[8], s2[15], cospi_16_64); + + s2[16] = vaddq_s16(s1[16], s2[23]); + s2[17] = vaddq_s16(s1[17], s2[22]); + s2[18] = vaddq_s16(s1[18], s1[21]); + s2[19] = vaddq_s16(s1[19], s1[20]); + s2[20] = vsubq_s16(s1[19], s1[20]); + s2[21] = vsubq_s16(s1[18], s1[21]); + s2[22] = vsubq_s16(s1[17], s2[22]); + s2[23] = vsubq_s16(s1[16], s2[23]); + + s3[24] = vsubq_s16(s1[31], s2[24]); + s3[25] = vsubq_s16(s1[30], s2[25]); + s3[26] = vsubq_s16(s1[29], s1[26]); + s3[27] = vsubq_s16(s1[28], s1[27]); + s2[28] = vaddq_s16(s1[27], s1[28]); + s2[29] = vaddq_s16(s1[26], s1[29]); + s2[30] = vaddq_s16(s2[25], s1[30]); + s2[31] = vaddq_s16(s2[24], s1[31]); + + // stage 7 + s1[0] = vaddq_s16(s2[0], s2[15]); + s1[1] = vaddq_s16(s2[1], s2[14]); + s1[2] = vaddq_s16(s2[2], s2[13]); + s1[3] = vaddq_s16(s2[3], s2[12]); + s1[4] = vaddq_s16(s2[4], s2[11]); + s1[5] = vaddq_s16(s2[5], s2[10]); + s1[6] = vaddq_s16(s2[6], s2[9]); + s1[7] = vaddq_s16(s2[7], s2[8]); + s1[8] = vsubq_s16(s2[7], s2[8]); + s1[9] = vsubq_s16(s2[6], s2[9]); + s1[10] = vsubq_s16(s2[5], s2[10]); + s1[11] = vsubq_s16(s2[4], s2[11]); + s1[12] = vsubq_s16(s2[3], s2[12]); + s1[13] = vsubq_s16(s2[2], s2[13]); + s1[14] = vsubq_s16(s2[1], s2[14]); + s1[15] = vsubq_s16(s2[0], s2[15]); + + s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64); + s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64); + + s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64); + s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64); + + s1[22] = sub_multiply_shift_and_narrow_s16(s3[25], s2[22], cospi_16_64); + s1[25] = add_multiply_shift_and_narrow_s16(s2[22], s3[25], cospi_16_64); + + s1[23] = sub_multiply_shift_and_narrow_s16(s3[24], s2[23], cospi_16_64); + s1[24] = add_multiply_shift_and_narrow_s16(s2[23], s3[24], cospi_16_64); + + // final stage + vst1q_s16(output, vaddq_s16(s1[0], s2[31])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[1], s2[30])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[2], s2[29])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[3], s2[28])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[4], s1[27])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[5], s1[26])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[6], s1[25])); + output += 8; + vst1q_s16(output, 
vaddq_s16(s1[7], s1[24])); + output += 8; + + vst1q_s16(output, vaddq_s16(s1[8], s1[23])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[9], s1[22])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[10], s1[21])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[11], s1[20])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[12], s2[19])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[13], s2[18])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[14], s2[17])); + output += 8; + vst1q_s16(output, vaddq_s16(s1[15], s2[16])); + output += 8; + + vst1q_s16(output, vsubq_s16(s1[15], s2[16])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[14], s2[17])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[13], s2[18])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[12], s2[19])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[11], s1[20])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[10], s1[21])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[9], s1[22])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[8], s1[23])); + output += 8; + + vst1q_s16(output, vsubq_s16(s1[7], s1[24])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[6], s1[25])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[5], s1[26])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[4], s1[27])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[3], s2[28])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[2], s2[29])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[1], s2[30])); + output += 8; + vst1q_s16(output, vsubq_s16(s1[0], s2[31])); +} + +void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride, + const int highbd_flag) { + int16x8_t in[8], s1[32], s2[32], s3[32], out[32]; + + load_and_transpose_s16_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4], + &in[5], &in[6], &in[7]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64); + + // Different for _8_ + s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64); + + s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64); + + s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64); + + // stage 3 + s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64); + s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64); + + s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31], + cospi_28_64); + s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31], + cospi_4_64); + + // Different for _8_ + s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_28_64, + s1[28], -cospi_4_64); + s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_4_64, s1[28], + cospi_28_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64, + s1[27], cospi_12_64); + s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27], + cospi_20_64); + + s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + 
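// Annotation (not part of the upstream change): the *_shift_and_narrow_s16
// helpers used above and below are presumably declared in
// vpx_dsp/arm/idct_neon.h, which this file includes. Judging by the paired
// formula comments in stage 1 of vpx_idct32_6_neon, they compute, per lane,
//   multiply_shift_and_narrow_s16(a, c)                  ~ dct_const_round_shift(a * c)
//   multiply_accumulate_shift_and_narrow_s16(a, ca, b, cb) ~ dct_const_round_shift(a * ca + b * cb)
//   sub_multiply_shift_and_narrow_s16(a, b, c)            ~ dct_const_round_shift((a - b) * c)
//   add_multiply_shift_and_narrow_s16(a, b, c)            ~ dct_const_round_shift((a + b) * c)
// each narrowed back to int16_t, i.e. the usual 14-bit rounded idct butterfly
// terms that the deleted assembly expressed with DO_BUTTERFLY.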
// stage 4 + s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64); + + s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15], + cospi_24_64); + s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15], + cospi_8_64); + + s2[10] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_24_64, + s2[12], -cospi_8_64); + s2[13] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_8_64, s2[12], + cospi_24_64); + + s2[16] = vaddq_s16(s1[16], s1[19]); + + s2[17] = vaddq_s16(s1[17], s1[18]); + s2[18] = vsubq_s16(s1[17], s1[18]); + + s2[19] = vsubq_s16(s1[16], s1[19]); + + s2[20] = vsubq_s16(s1[23], s1[20]); + s2[21] = vsubq_s16(s1[22], s1[21]); + + s2[22] = vaddq_s16(s1[21], s1[22]); + s2[23] = vaddq_s16(s1[20], s1[23]); + + s2[24] = vaddq_s16(s1[24], s1[27]); + s2[25] = vaddq_s16(s1[25], s1[26]); + s2[26] = vsubq_s16(s1[25], s1[26]); + s2[27] = vsubq_s16(s1[24], s1[27]); + + s2[28] = vsubq_s16(s1[31], s1[28]); + s2[29] = vsubq_s16(s1[30], s1[29]); + s2[30] = vaddq_s16(s1[29], s1[30]); + s2[31] = vaddq_s16(s1[28], s1[31]); + + // stage 5 + s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64); + s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64); + + s1[8] = vaddq_s16(s2[8], s2[11]); + s1[9] = vaddq_s16(s2[9], s2[10]); + s1[10] = vsubq_s16(s2[9], s2[10]); + s1[11] = vsubq_s16(s2[8], s2[11]); + s1[12] = vsubq_s16(s2[15], s2[12]); + s1[13] = vsubq_s16(s2[14], s2[13]); + s1[14] = vaddq_s16(s2[13], s2[14]); + s1[15] = vaddq_s16(s2[12], s2[15]); + + s1[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_8_64, s2[29], + cospi_24_64); + s1[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], cospi_24_64, s2[29], + cospi_8_64); + + s1[19] = multiply_accumulate_shift_and_narrow_s16(s2[19], -cospi_8_64, s2[28], + cospi_24_64); + s1[28] = multiply_accumulate_shift_and_narrow_s16(s2[19], cospi_24_64, s2[28], + cospi_8_64); + + s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64, + s2[27], -cospi_8_64); + s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27], + cospi_24_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64, + s2[26], -cospi_8_64); + s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26], + cospi_24_64); + + // stage 6 + s2[0] = vaddq_s16(s1[0], s1[7]); + s2[1] = vaddq_s16(s1[0], s1[6]); + s2[2] = vaddq_s16(s1[0], s1[5]); + s2[3] = vaddq_s16(s1[0], s1[4]); + s2[4] = vsubq_s16(s1[0], s1[4]); + s2[5] = vsubq_s16(s1[0], s1[5]); + s2[6] = vsubq_s16(s1[0], s1[6]); + s2[7] = vsubq_s16(s1[0], s1[7]); + + s2[10] = sub_multiply_shift_and_narrow_s16(s1[13], s1[10], cospi_16_64); + s2[13] = add_multiply_shift_and_narrow_s16(s1[10], s1[13], cospi_16_64); + + s2[11] = sub_multiply_shift_and_narrow_s16(s1[12], s1[11], cospi_16_64); + s2[12] = add_multiply_shift_and_narrow_s16(s1[11], s1[12], cospi_16_64); + + s1[16] = vaddq_s16(s2[16], s2[23]); + s1[17] = vaddq_s16(s2[17], s2[22]); + s2[18] = vaddq_s16(s1[18], s1[21]); + s2[19] = vaddq_s16(s1[19], s1[20]); + s2[20] = vsubq_s16(s1[19], s1[20]); + s2[21] = vsubq_s16(s1[18], s1[21]); + s1[22] = vsubq_s16(s2[17], s2[22]); + s1[23] = vsubq_s16(s2[16], s2[23]); + + s3[24] = vsubq_s16(s2[31], s2[24]); + s3[25] = vsubq_s16(s2[30], s2[25]); + s3[26] = vsubq_s16(s1[29], s1[26]); + s3[27] = vsubq_s16(s1[28], s1[27]); + s2[28] = vaddq_s16(s1[27], s1[28]); + s2[29] = vaddq_s16(s1[26], s1[29]); + s2[30] = vaddq_s16(s2[25], s2[30]); + s2[31] = vaddq_s16(s2[24], s2[31]); + + // stage 7 + 
s1[0] = vaddq_s16(s2[0], s1[15]); + s1[1] = vaddq_s16(s2[1], s1[14]); + s1[2] = vaddq_s16(s2[2], s2[13]); + s1[3] = vaddq_s16(s2[3], s2[12]); + s1[4] = vaddq_s16(s2[4], s2[11]); + s1[5] = vaddq_s16(s2[5], s2[10]); + s1[6] = vaddq_s16(s2[6], s1[9]); + s1[7] = vaddq_s16(s2[7], s1[8]); + s1[8] = vsubq_s16(s2[7], s1[8]); + s1[9] = vsubq_s16(s2[6], s1[9]); + s1[10] = vsubq_s16(s2[5], s2[10]); + s1[11] = vsubq_s16(s2[4], s2[11]); + s1[12] = vsubq_s16(s2[3], s2[12]); + s1[13] = vsubq_s16(s2[2], s2[13]); + s1[14] = vsubq_s16(s2[1], s1[14]); + s1[15] = vsubq_s16(s2[0], s1[15]); + + s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64); + s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64); + + s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64); + s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64); + + s2[22] = sub_multiply_shift_and_narrow_s16(s3[25], s1[22], cospi_16_64); + s1[25] = add_multiply_shift_and_narrow_s16(s1[22], s3[25], cospi_16_64); + + s2[23] = sub_multiply_shift_and_narrow_s16(s3[24], s1[23], cospi_16_64); + s1[24] = add_multiply_shift_and_narrow_s16(s1[23], s3[24], cospi_16_64); + + // final stage + out[0] = final_add(s1[0], s2[31]); + out[1] = final_add(s1[1], s2[30]); + out[2] = final_add(s1[2], s2[29]); + out[3] = final_add(s1[3], s2[28]); + out[4] = final_add(s1[4], s1[27]); + out[5] = final_add(s1[5], s1[26]); + out[6] = final_add(s1[6], s1[25]); + out[7] = final_add(s1[7], s1[24]); + out[8] = final_add(s1[8], s2[23]); + out[9] = final_add(s1[9], s2[22]); + out[10] = final_add(s1[10], s1[21]); + out[11] = final_add(s1[11], s1[20]); + out[12] = final_add(s1[12], s2[19]); + out[13] = final_add(s1[13], s2[18]); + out[14] = final_add(s1[14], s1[17]); + out[15] = final_add(s1[15], s1[16]); + out[16] = final_sub(s1[15], s1[16]); + out[17] = final_sub(s1[14], s1[17]); + out[18] = final_sub(s1[13], s2[18]); + out[19] = final_sub(s1[12], s2[19]); + out[20] = final_sub(s1[11], s1[20]); + out[21] = final_sub(s1[10], s1[21]); + out[22] = final_sub(s1[9], s2[22]); + out[23] = final_sub(s1[8], s2[23]); + out[24] = final_sub(s1[7], s1[24]); + out[25] = final_sub(s1[6], s1[25]); + out[26] = final_sub(s1[5], s1[26]); + out[27] = final_sub(s1[4], s1[27]); + out[28] = final_sub(s1[3], s2[28]); + out[29] = final_sub(s1[2], s2[29]); + out[30] = final_sub(s1[1], s2[30]); + out[31] = final_sub(s1[0], s2[31]); + + if (highbd_flag) { + highbd_add_and_store_bd8(out, output, stride); + } else { + uint8_t *const outputT = (uint8_t *)output; + add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6], + out[7], outputT, stride); + add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13], + out[14], out[15], outputT + (8 * stride), stride); + add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21], + out[22], out[23], outputT + (16 * stride), stride); + add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29], + out[30], out[31], outputT + (24 * stride), stride); + } +} + +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int i; + int16_t temp[32 * 8]; + int16_t *t = temp; + + vpx_idct32_6_neon(input, t); + + for (i = 0; i < 32; i += 8) { + vpx_idct32_8_neon(t, dest, stride, 0); + t += (8 * 8); + dest += 8; + } +} diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm deleted file mode 100644 index 7483ee77e1..0000000000 --- 
a/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm +++ /dev/null @@ -1,1299 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -;TODO(cd): adjust these constant to be able to use vqdmulh for faster -; dct_const_round_shift(a * b) within butterfly calculations. -cospi_1_64 EQU 16364 -cospi_2_64 EQU 16305 -cospi_3_64 EQU 16207 -cospi_4_64 EQU 16069 -cospi_5_64 EQU 15893 -cospi_6_64 EQU 15679 -cospi_7_64 EQU 15426 -cospi_8_64 EQU 15137 -cospi_9_64 EQU 14811 -cospi_10_64 EQU 14449 -cospi_11_64 EQU 14053 -cospi_12_64 EQU 13623 -cospi_13_64 EQU 13160 -cospi_14_64 EQU 12665 -cospi_15_64 EQU 12140 -cospi_16_64 EQU 11585 -cospi_17_64 EQU 11003 -cospi_18_64 EQU 10394 -cospi_19_64 EQU 9760 -cospi_20_64 EQU 9102 -cospi_21_64 EQU 8423 -cospi_22_64 EQU 7723 -cospi_23_64 EQU 7005 -cospi_24_64 EQU 6270 -cospi_25_64 EQU 5520 -cospi_26_64 EQU 4756 -cospi_27_64 EQU 3981 -cospi_28_64 EQU 3196 -cospi_29_64 EQU 2404 -cospi_30_64 EQU 1606 -cospi_31_64 EQU 804 - - - EXPORT |vpx_idct32x32_1024_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - AREA Block, CODE, READONLY - - ; -------------------------------------------------------------------------- - ; Load from transposed_buffer - ; q13 = transposed_buffer[first_offset] - ; q14 = transposed_buffer[second_offset] - ; for proper address calculation, the last offset used when manipulating - ; transposed_buffer must be passed in. use 0 for first use. - MACRO - LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset - ; address calculation with proper stride and loading - add r0, #($first_offset - $prev_offset )*8*2 - vld1.s16 {q14}, [r0] - add r0, #($second_offset - $first_offset)*8*2 - vld1.s16 {q13}, [r0] - ; (used) two registers (q14, q13) - MEND - ; -------------------------------------------------------------------------- - ; Load from output (used as temporary storage) - ; reg1 = output[first_offset] - ; reg2 = output[second_offset] - ; for proper address calculation, the last offset used when manipulating - ; output, whether reading or storing) must be passed in. use 0 for first - ; use. - MACRO - LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 - ; address calculation with proper stride and loading - add r1, #($first_offset - $prev_offset )*32*2 - vld1.s16 {$reg1}, [r1] - add r1, #($second_offset - $first_offset)*32*2 - vld1.s16 {$reg2}, [r1] - ; (used) two registers ($reg1, $reg2) - MEND - ; -------------------------------------------------------------------------- - ; Store into output (sometimes as as temporary storage) - ; output[first_offset] = reg1 - ; output[second_offset] = reg2 - ; for proper address calculation, the last offset used when manipulating - ; output, whether reading or storing) must be passed in. use 0 for first - ; use. 
- MACRO - STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 - ; address calculation with proper stride and storing - add r1, #($first_offset - $prev_offset )*32*2 - vst1.16 {$reg1}, [r1] - add r1, #($second_offset - $first_offset)*32*2 - vst1.16 {$reg2}, [r1] - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q6-q9 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_CENTER_RESULTS - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d8}, [r10], r2 - vld1.s16 {d11}, [r9], r11 - vld1.s16 {d9}, [r10] - vld1.s16 {d10}, [r9] - ; ROUND_POWER_OF_TWO - vrshr.s16 q7, q7, #6 - vrshr.s16 q8, q8, #6 - vrshr.s16 q9, q9, #6 - vrshr.s16 q6, q6, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q7, q7, d9 - vaddw.u8 q8, q8, d10 - vaddw.u8 q9, q9, d11 - vaddw.u8 q6, q6, d8 - ; clip pixel - vqmovun.s16 d9, q7 - vqmovun.s16 d10, q8 - vqmovun.s16 d11, q9 - vqmovun.s16 d8, q6 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d9}, [r10], r11 - vst1.16 {d10}, [r9], r2 - vst1.16 {d8}, [r10] - vst1.16 {d11}, [r9] - ; update pointers (by dest_stride * 2) - sub r9, r9, r2, lsl #1 - add r10, r10, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q6-q9 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_CENTER_RESULTS_LAST - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d8}, [r10], r2 - vld1.s16 {d11}, [r9], r11 - vld1.s16 {d9}, [r10] - vld1.s16 {d10}, [r9] - ; ROUND_POWER_OF_TWO - vrshr.s16 q7, q7, #6 - vrshr.s16 q8, q8, #6 - vrshr.s16 q9, q9, #6 - vrshr.s16 q6, q6, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q7, q7, d9 - vaddw.u8 q8, q8, d10 - vaddw.u8 q9, q9, d11 - vaddw.u8 q6, q6, d8 - ; clip pixel - vqmovun.s16 d9, q7 - vqmovun.s16 d10, q8 - vqmovun.s16 d11, q9 - vqmovun.s16 d8, q6 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d9}, [r10], r11 - vst1.16 {d10}, [r9], r2 - vst1.16 {d8}, [r10]! - vst1.16 {d11}, [r9]! 
- ; update pointers (by dest_stride * 2) - sub r9, r9, r2, lsl #1 - add r10, r10, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q4-q7 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_EXTREME_RESULTS - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d4}, [r7], r2 - vld1.s16 {d7}, [r6], r11 - vld1.s16 {d5}, [r7] - vld1.s16 {d6}, [r6] - ; ROUND_POWER_OF_TWO - vrshr.s16 q5, q5, #6 - vrshr.s16 q6, q6, #6 - vrshr.s16 q7, q7, #6 - vrshr.s16 q4, q4, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q5, q5, d5 - vaddw.u8 q6, q6, d6 - vaddw.u8 q7, q7, d7 - vaddw.u8 q4, q4, d4 - ; clip pixel - vqmovun.s16 d5, q5 - vqmovun.s16 d6, q6 - vqmovun.s16 d7, q7 - vqmovun.s16 d4, q4 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d5}, [r7], r11 - vst1.16 {d6}, [r6], r2 - vst1.16 {d7}, [r6] - vst1.16 {d4}, [r7] - ; update pointers (by dest_stride * 2) - sub r6, r6, r2, lsl #1 - add r7, r7, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q4-q7 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_EXTREME_RESULTS_LAST - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d4}, [r7], r2 - vld1.s16 {d7}, [r6], r11 - vld1.s16 {d5}, [r7] - vld1.s16 {d6}, [r6] - ; ROUND_POWER_OF_TWO - vrshr.s16 q5, q5, #6 - vrshr.s16 q6, q6, #6 - vrshr.s16 q7, q7, #6 - vrshr.s16 q4, q4, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q5, q5, d5 - vaddw.u8 q6, q6, d6 - vaddw.u8 q7, q7, d7 - vaddw.u8 q4, q4, d4 - ; clip pixel - vqmovun.s16 d5, q5 - vqmovun.s16 d6, q6 - vqmovun.s16 d7, q7 - vqmovun.s16 d4, q4 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d5}, [r7], r11 - vst1.16 {d6}, [r6], r2 - vst1.16 {d7}, [r6]! - vst1.16 {d4}, [r7]! - ; update pointers (by dest_stride * 2) - sub r6, r6, r2, lsl #1 - add r7, r7, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Touches q8-q12, q15 (q13-q14 are preserved) - ; valid output registers are anything but q8-q11 - MACRO - DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - ; TODO(cd): have special case to re-use constants when they are similar for - ; consecutive butterflies - ; TODO(cd): have special case when both constants are the same, do the - ; additions/subtractions before the multiplies. 
- ; generate the constants - ; generate scalar constants - mov r8, #$first_constant & 0xFF00 - mov r12, #$second_constant & 0xFF00 - add r8, #$first_constant & 0x00FF - add r12, #$second_constant & 0x00FF - ; generate vector constants - vdup.16 d30, r8 - vdup.16 d31, r12 - ; (used) two for inputs (regA-regD), one for constants (q15) - ; do some multiplications (ordered for maximum latency hiding) - vmull.s16 q8, $regC, d30 - vmull.s16 q10, $regA, d31 - vmull.s16 q9, $regD, d30 - vmull.s16 q11, $regB, d31 - vmull.s16 q12, $regC, d31 - ; (used) five for intermediate (q8-q12), one for constants (q15) - ; do some addition/subtractions (to get back two register) - vsub.s32 q8, q8, q10 - vsub.s32 q9, q9, q11 - ; do more multiplications (ordered for maximum latency hiding) - vmull.s16 q10, $regD, d31 - vmull.s16 q11, $regA, d30 - vmull.s16 q15, $regB, d30 - ; (used) six for intermediate (q8-q12, q15) - ; do more addition/subtractions - vadd.s32 q11, q12, q11 - vadd.s32 q10, q10, q15 - ; (used) four for intermediate (q8-q11) - ; dct_const_round_shift - vqrshrn.s32 $reg1, q8, #14 - vqrshrn.s32 $reg2, q9, #14 - vqrshrn.s32 $reg3, q11, #14 - vqrshrn.s32 $reg4, q10, #14 - ; (used) two for results, well four d registers - MEND - ; -------------------------------------------------------------------------- - ; Touches q8-q12, q15 (q13-q14 are preserved) - ; valid output registers are anything but q8-q11 - MACRO - DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - MEND - ; -------------------------------------------------------------------------- - -;void vpx_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride); -; -; r0 int16_t *input, -; r1 uint8_t *dest, -; r2 int dest_stride) -; loop counters -; r4 bands loop counter -; r5 pass loop counter -; r8 transpose loop counter -; combine-add pointers -; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...) -; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...) -; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...) -; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...) - -|vpx_idct32x32_1024_add_neon| PROC - ; This function does one pass of idct32x32 transform. - ; - ; This is done by transposing the input and then doing a 1d transform on - ; columns. In the first pass, the transposed columns are the original - ; rows. In the second pass, after the transposition, the colums are the - ; original columns. - ; The 1d transform is done by looping over bands of eight columns (the - ; idct32_bands loop). For each band, the transform input transposition - ; is done on demand, one band of four 8x8 matrices at a time. The four - ; matrices are transposed by pairs (the idct32_transpose_pair loop). 
- push {r4-r11} - vpush {d8-d15} - ; stack operation - ; internal buffer used to transpose 8 lines into before transforming them - ; int16_t transpose_buffer[32 * 8]; - ; at sp + [4096, 4607] - ; results of the first pass (transpose and transform rows) - ; int16_t pass1[32 * 32]; - ; at sp + [0, 2047] - ; results of the second pass (transpose and transform columns) - ; int16_t pass2[32 * 32]; - ; at sp + [2048, 4095] - sub sp, sp, #512+2048+2048 - - ; r6 = dest + 31 * dest_stride - ; r7 = dest + 0 * dest_stride - ; r9 = dest + 15 * dest_stride - ; r10 = dest + 16 * dest_stride - rsb r6, r2, r2, lsl #5 - rsb r9, r2, r2, lsl #4 - add r10, r1, r2, lsl #4 - mov r7, r1 - add r6, r6, r1 - add r9, r9, r1 - ; r11 = -dest_stride - neg r11, r2 - ; r3 = input - mov r3, r0 - ; parameters for first pass - ; r0 = transpose_buffer[32 * 8] - add r0, sp, #4096 - ; r1 = pass1[32 * 32] - mov r1, sp - - mov r5, #0 ; initialize pass loop counter -idct32_pass_loop - mov r4, #4 ; initialize bands loop counter -idct32_bands_loop - mov r8, #2 ; initialize transpose loop counter -idct32_transpose_pair_loop - ; Load two horizontally consecutive 8x8 16bit data matrices. The first one - ; into q0-q7 and the second one into q8-q15. There is a stride of 64, - ; adjusted to 32 because of the two post-increments. - vld1.s16 {q8}, [r3]! - vld1.s16 {q0}, [r3]! - add r3, #32 - vld1.s16 {q9}, [r3]! - vld1.s16 {q1}, [r3]! - add r3, #32 - vld1.s16 {q10}, [r3]! - vld1.s16 {q2}, [r3]! - add r3, #32 - vld1.s16 {q11}, [r3]! - vld1.s16 {q3}, [r3]! - add r3, #32 - vld1.s16 {q12}, [r3]! - vld1.s16 {q4}, [r3]! - add r3, #32 - vld1.s16 {q13}, [r3]! - vld1.s16 {q5}, [r3]! - add r3, #32 - vld1.s16 {q14}, [r3]! - vld1.s16 {q6}, [r3]! - add r3, #32 - vld1.s16 {q15}, [r3]! - vld1.s16 {q7}, [r3]! - - ; Transpose the two 8x8 16bit data matrices. - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vswp d1, d8 - vswp d7, d14 - vswp d5, d12 - vswp d3, d10 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.32 q0, q2 - vtrn.32 q1, q3 - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - vtrn.16 q0, q1 - vtrn.16 q2, q3 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - ; Store both matrices after each other. There is a stride of 32, which - ; adjusts to nothing because of the post-increments. - vst1.16 {q8}, [r0]! - vst1.16 {q9}, [r0]! - vst1.16 {q10}, [r0]! - vst1.16 {q11}, [r0]! - vst1.16 {q12}, [r0]! - vst1.16 {q13}, [r0]! - vst1.16 {q14}, [r0]! - vst1.16 {q15}, [r0]! - vst1.16 {q0}, [r0]! - vst1.16 {q1}, [r0]! - vst1.16 {q2}, [r0]! - vst1.16 {q3}, [r0]! - vst1.16 {q4}, [r0]! - vst1.16 {q5}, [r0]! - vst1.16 {q6}, [r0]! - vst1.16 {q7}, [r0]! - - ; increment pointers by adjusted stride (not necessary for r0/out) - ; go back by 7*32 for the seven lines moved fully by read and add - ; go back by 32 for the eigth line only read - ; advance by 16*2 to go the next pair - sub r3, r3, #7*32*2 + 32 - 16*2 - ; transpose pair loop processing - subs r8, r8, #1 - bne idct32_transpose_pair_loop - - ; restore r0/input to its original value - sub r0, r0, #32*8*2 - - ; Instead of doing the transforms stage by stage, it is done by loading - ; some input values and doing as many stages as possible to minimize the - ; storing/loading of intermediate results. 
To fit within registers, the - ; final coefficients are cut into four blocks: - ; BLOCK A: 16-19,28-31 - ; BLOCK B: 20-23,24-27 - ; BLOCK C: 8-10,11-15 - ; BLOCK D: 0-3,4-7 - ; Blocks A and C are straight calculation through the various stages. In - ; block B, further calculations are performed using the results from - ; block A. In block D, further calculations are performed using the results - ; from block C and then the final calculations are done using results from - ; block A and B which have been combined at the end of block B. - - ; -------------------------------------------------------------------------- - ; BLOCK A: 16-19,28-31 - ; -------------------------------------------------------------------------- - ; generate 16,17,30,31 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64; - ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64; - ;step1b[16][i] = dct_const_round_shift(temp1); - ;step1b[31][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 0, 1, 31 - DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64; - ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64; - ;step1b[17][i] = dct_const_round_shift(temp1); - ;step1b[30][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 31, 17, 15 - DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[16] = step1b[16][i] + step1b[17][i]; - ;step2[17] = step1b[16][i] - step1b[17][i]; - ;step2[30] = -step1b[30][i] + step1b[31][i]; - ;step2[31] = step1b[30][i] + step1b[31][i]; - vadd.s16 q4, q0, q1 - vsub.s16 q13, q0, q1 - vadd.s16 q6, q2, q3 - vsub.s16 q14, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64; - ;temp2 = step1b[30][i] * cospi_4_64 - step1b[17][i] * cospi_28_64; - ;step3[17] = dct_const_round_shift(temp1); - ;step3[30] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; generate 18,19,28,29 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64; - ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64; - ;step1b[18][i] = dct_const_round_shift(temp1); - ;step1b[29][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 15, 9, 23 - DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64; - ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64; - ;step1b[19][i] = dct_const_round_shift(temp1); - ;step1b[28][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 23, 25, 7 - DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[18] = -step1b[18][i] + step1b[19][i]; - ;step2[19] = step1b[18][i] + step1b[19][i]; - ;step2[28] = 
step1b[28][i] + step1b[29][i]; - ;step2[29] = step1b[28][i] - step1b[29][i]; - vsub.s16 q13, q3, q2 - vadd.s16 q3, q3, q2 - vsub.s16 q14, q1, q0 - vadd.s16 q2, q1, q0 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64); - ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64); - ;step3[29] = dct_const_round_shift(temp1); - ;step3[18] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1 - ; -------------------------------------------------------------------------- - ; combine 16-19,28-31 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[16] = step1b[16][i] + step1b[19][i]; - ;step1[17] = step1b[17][i] + step1b[18][i]; - ;step1[18] = step1b[17][i] - step1b[18][i]; - ;step1[29] = step1b[30][i] - step1b[29][i]; - ;step1[30] = step1b[30][i] + step1b[29][i]; - ;step1[31] = step1b[31][i] + step1b[28][i]; - vadd.s16 q8, q4, q2 - vadd.s16 q9, q5, q0 - vadd.s16 q10, q7, q1 - vadd.s16 q15, q6, q3 - vsub.s16 q13, q5, q0 - vsub.s16 q14, q7, q1 - STORE_IN_OUTPUT 0, 16, 31, q8, q15 - STORE_IN_OUTPUT 31, 17, 30, q9, q10 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64; - ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64; - ;step2[18] = dct_const_round_shift(temp1); - ;step2[29] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3 - STORE_IN_OUTPUT 30, 29, 18, q1, q0 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[19] = step1b[16][i] - step1b[19][i]; - ;step1[28] = step1b[31][i] - step1b[28][i]; - vsub.s16 q13, q4, q2 - vsub.s16 q14, q6, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64; - ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64; - ;step2[19] = dct_const_round_shift(temp1); - ;step2[28] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13 - STORE_IN_OUTPUT 18, 19, 28, q4, q6 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK B: 20-23,24-27 - ; -------------------------------------------------------------------------- - ; generate 20,21,26,27 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64; - ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64; - ;step1b[20][i] = dct_const_round_shift(temp1); - ;step1b[27][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 7, 5, 27 - DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64; - ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64; - ;step1b[21][i] = dct_const_round_shift(temp1); - ;step1b[26][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 27, 21, 11 - DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7 - ; 
-------------------------------------------------------------------------- - ; part of stage 2 - ;step2[20] = step1b[20][i] + step1b[21][i]; - ;step2[21] = step1b[20][i] - step1b[21][i]; - ;step2[26] = -step1b[26][i] + step1b[27][i]; - ;step2[27] = step1b[26][i] + step1b[27][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64; - ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64; - ;step3[21] = dct_const_round_shift(temp1); - ;step3[26] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 22,23,24,25 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64; - ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64; - ;step1b[22][i] = dct_const_round_shift(temp1); - ;step1b[25][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 11, 13, 19 - DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64; - ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64; - ;step1b[23][i] = dct_const_round_shift(temp1); - ;step1b[24][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 19, 29, 3 - DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[22] = -step1b[22][i] + step1b[23][i]; - ;step2[23] = step1b[22][i] + step1b[23][i]; - ;step2[24] = step1b[24][i] + step1b[25][i]; - ;step2[25] = step1b[24][i] - step1b[25][i]; - vsub.s16 q14, q4, q5 - vadd.s16 q5, q4, q5 - vsub.s16 q13, q6, q7 - vadd.s16 q6, q6, q7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64); - ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64); - ;step3[25] = dct_const_round_shift(temp1); - ;step3[22] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15 - ; -------------------------------------------------------------------------- - ; combine 20-23,24-27 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[22] = step1b[22][i] + step1b[21][i]; - ;step1[23] = step1b[23][i] + step1b[20][i]; - vadd.s16 q10, q7, q1 - vadd.s16 q11, q5, q0 - ;step1[24] = step1b[24][i] + step1b[27][i]; - ;step1[25] = step1b[25][i] + step1b[26][i]; - vadd.s16 q12, q6, q2 - vadd.s16 q15, q4, q3 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[16] = step1b[16][i] + step1b[23][i]; - ;step3[17] = step1b[17][i] + step1b[22][i]; - ;step3[22] = step1b[17][i] - step1b[22][i]; - ;step3[23] = step1b[16][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 28, 16, 17, q14, q13 - vadd.s16 q8, q14, q11 - vadd.s16 q9, q13, q10 - vsub.s16 q13, q13, q10 - vsub.s16 q11, q14, q11 - STORE_IN_OUTPUT 17, 17, 16, q9, q8 - ; -------------------------------------------------------------------------- - ; part of stage 6 - 
;step3[24] = step1b[31][i] - step1b[24][i]; - ;step3[25] = step1b[30][i] - step1b[25][i]; - ;step3[30] = step1b[30][i] + step1b[25][i]; - ;step3[31] = step1b[31][i] + step1b[24][i]; - LOAD_FROM_OUTPUT 16, 30, 31, q14, q9 - vsub.s16 q8, q9, q12 - vadd.s16 q10, q14, q15 - vsub.s16 q14, q14, q15 - vadd.s16 q12, q9, q12 - STORE_IN_OUTPUT 31, 30, 31, q10, q12 - ; -------------------------------------------------------------------------- - ; TODO(cd) do some register allocation change to remove these push/pop - vpush {q8} ; [24] - vpush {q11} ; [23] - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64; - ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64; - ;step1[22] = dct_const_round_shift(temp1); - ;step1[25] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 31, 25, 22, q14, q13 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64; - ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64; - ;step1[23] = dct_const_round_shift(temp1); - ;step1[24] = dct_const_round_shift(temp2); - ; TODO(cd) do some register allocation change to remove these push/pop - vpop {q13} ; [23] - vpop {q14} ; [24] - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 22, 24, 23, q14, q13 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[20] = step1b[23][i] - step1b[20][i]; - ;step1[27] = step1b[24][i] - step1b[27][i]; - vsub.s16 q14, q5, q0 - vsub.s16 q13, q6, q2 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64); - ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64); - ;step2[27] = dct_const_round_shift(temp1); - ;step2[20] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[21] = step1b[22][i] - step1b[21][i]; - ;step1[26] = step1b[25][i] - step1b[26][i]; - vsub.s16 q14, q7, q1 - vsub.s16 q13, q4, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64); - ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64); - ;step2[26] = dct_const_round_shift(temp1); - ;step2[21] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[18] = step1b[18][i] + step1b[21][i]; - ;step3[19] = step1b[19][i] + step1b[20][i]; - ;step3[20] = step1b[19][i] - step1b[20][i]; - ;step3[21] = step1b[18][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 23, 18, 19, q14, q13 - vadd.s16 q8, q14, q1 - vadd.s16 q9, q13, q6 - vsub.s16 q13, q13, q6 - vsub.s16 q1, q14, q1 - STORE_IN_OUTPUT 19, 18, 19, q8, q9 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[27] = step1b[28][i] - step1b[27][i]; - ;step3[28] = step1b[28][i] + step1b[27][i]; - ;step3[29] = step1b[29][i] + step1b[26][i]; - ;step3[26] = step1b[29][i] - step1b[26][i]; - LOAD_FROM_OUTPUT 19, 28, 29, q8, q9 - vsub.s16 q14, 
q8, q5 - vadd.s16 q10, q8, q5 - vadd.s16 q11, q9, q0 - vsub.s16 q0, q9, q0 - STORE_IN_OUTPUT 29, 28, 29, q10, q11 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64; - ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64; - ;step1[20] = dct_const_round_shift(temp1); - ;step1[27] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 29, 20, 27, q13, q14 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64; - ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64; - ;step1[21] = dct_const_round_shift(temp1); - ;step1[26] = dct_const_round_shift(temp2); - DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1 - STORE_IN_OUTPUT 27, 21, 26, q1, q0 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK C: 8-10,11-15 - ; -------------------------------------------------------------------------- - ; generate 8,9,14,15 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64; - ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64; - ;step2[8] = dct_const_round_shift(temp1); - ;step2[15] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 3, 2, 30 - DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64; - ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64; - ;step2[9] = dct_const_round_shift(temp1); - ;step2[14] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 30, 18, 14 - DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;step3[8] = step1b[8][i] + step1b[9][i]; - ;step3[9] = step1b[8][i] - step1b[9][i]; - ;step3[14] = step1b[15][i] - step1b[14][i]; - ;step3[15] = step1b[15][i] + step1b[14][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64; - ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64; - ;step1[9] = dct_const_round_shift(temp1); - ;step1[14] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 10,11,12,13 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64; - ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64; - ;step2[10] = dct_const_round_shift(temp1); - ;step2[13] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 14, 10, 22 - DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64; - ;temp2 = input[26 * 32] * 
cospi_26_64 + input[6 * 32] * cospi_6_64; - ;step2[11] = dct_const_round_shift(temp1); - ;step2[12] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 22, 26, 6 - DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;step3[10] = step1b[11][i] - step1b[10][i]; - ;step3[11] = step1b[11][i] + step1b[10][i]; - ;step3[12] = step1b[12][i] + step1b[13][i]; - ;step3[13] = step1b[12][i] - step1b[13][i]; - vsub.s16 q14, q4, q5 - vadd.s16 q5, q4, q5 - vsub.s16 q13, q6, q7 - vadd.s16 q6, q6, q7 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64); - ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64); - ;step1[13] = dct_const_round_shift(temp1); - ;step1[10] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15 - ; -------------------------------------------------------------------------- - ; combine 8-10,11-15 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[8] = step1b[8][i] + step1b[11][i]; - ;step2[9] = step1b[9][i] + step1b[10][i]; - ;step2[10] = step1b[9][i] - step1b[10][i]; - vadd.s16 q8, q0, q5 - vadd.s16 q9, q1, q7 - vsub.s16 q13, q1, q7 - ;step2[13] = step1b[14][i] - step1b[13][i]; - ;step2[14] = step1b[14][i] + step1b[13][i]; - ;step2[15] = step1b[15][i] + step1b[12][i]; - vsub.s16 q14, q3, q4 - vadd.s16 q10, q3, q4 - vadd.s16 q15, q2, q6 - STORE_IN_OUTPUT 26, 8, 15, q8, q15 - STORE_IN_OUTPUT 15, 9, 14, q9, q10 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64; - ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64; - ;step3[10] = dct_const_round_shift(temp1); - ;step3[13] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - STORE_IN_OUTPUT 14, 13, 10, q3, q1 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[11] = step1b[8][i] - step1b[11][i]; - ;step2[12] = step1b[15][i] - step1b[12][i]; - vsub.s16 q13, q0, q5 - vsub.s16 q14, q2, q6 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64; - ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64; - ;step3[11] = dct_const_round_shift(temp1); - ;step3[12] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - STORE_IN_OUTPUT 10, 11, 12, q1, q3 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK D: 0-3,4-7 - ; -------------------------------------------------------------------------- - ; generate 4,5,6,7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64; - ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64; - ;step3[4] = dct_const_round_shift(temp1); - ;step3[7] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 6, 4, 28 - DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = input[20 * 32] * 
cospi_12_64 - input[12 * 32] * cospi_20_64; - ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64; - ;step3[5] = dct_const_round_shift(temp1); - ;step3[6] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 28, 20, 12 - DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[4] = step1b[4][i] + step1b[5][i]; - ;step1[5] = step1b[4][i] - step1b[5][i]; - ;step1[6] = step1b[7][i] - step1b[6][i]; - ;step1[7] = step1b[7][i] + step1b[6][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64; - ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64; - ;step2[5] = dct_const_round_shift(temp1); - ;step2[6] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 0,1,2,3 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64; - ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64; - ;step1[1] = dct_const_round_shift(temp1); - ;step1[0] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 12, 0, 16 - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64; - ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64; - ;step1[2] = dct_const_round_shift(temp1); - ;step1[3] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 16, 8, 24 - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[0] = step1b[0][i] + step1b[3][i]; - ;step2[1] = step1b[1][i] + step1b[2][i]; - ;step2[2] = step1b[1][i] - step1b[2][i]; - ;step2[3] = step1b[0][i] - step1b[3][i]; - vadd.s16 q4, q7, q6 - vsub.s16 q7, q7, q6 - vsub.s16 q6, q5, q14 - vadd.s16 q5, q5, q14 - ; -------------------------------------------------------------------------- - ; combine 0-3,4-7 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[0] = step1b[0][i] + step1b[7][i]; - ;step3[1] = step1b[1][i] + step1b[6][i]; - ;step3[2] = step1b[2][i] + step1b[5][i]; - ;step3[3] = step1b[3][i] + step1b[4][i]; - vadd.s16 q8, q4, q2 - vadd.s16 q9, q5, q3 - vadd.s16 q10, q6, q1 - vadd.s16 q11, q7, q0 - ;step3[4] = step1b[3][i] - step1b[4][i]; - ;step3[5] = step1b[2][i] - step1b[5][i]; - ;step3[6] = step1b[1][i] - step1b[6][i]; - ;step3[7] = step1b[0][i] - step1b[7][i]; - vsub.s16 q12, q7, q0 - vsub.s16 q13, q6, q1 - vsub.s16 q14, q5, q3 - vsub.s16 q15, q4, q2 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[0] = step1b[0][i] + step1b[15][i]; - ;step1[1] = step1b[1][i] + step1b[14][i]; - ;step1[14] = step1b[1][i] - step1b[14][i]; - ;step1[15] = step1b[0][i] - step1b[15][i]; - LOAD_FROM_OUTPUT 12, 14, 15, q0, q1 - vadd.s16 q2, q8, q1 - vadd.s16 q3, q9, q0 - vsub.s16 q4, q9, q0 - vsub.s16 q5, q8, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[14 * 32] = 
step1b[14][i] + step1b[17][i]; - ;output[15 * 32] = step1b[15][i] + step1b[16][i]; - ;output[16 * 32] = step1b[15][i] - step1b[16][i]; - ;output[17 * 32] = step1b[14][i] - step1b[17][i]; - LOAD_FROM_OUTPUT 15, 16, 17, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - - cmp r5, #0 - bgt idct32_bands_end_2nd_pass - -idct32_bands_end_1st_pass - STORE_IN_OUTPUT 17, 16, 17, q6, q7 - STORE_IN_OUTPUT 17, 14, 15, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; - ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; - ;output[30 * 32] = step1b[1][i] - step1b[30][i]; - ;output[31 * 32] = step1b[0][i] - step1b[31][i]; - LOAD_FROM_OUTPUT 15, 30, 31, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 31, 30, 31, q6, q7 - STORE_IN_OUTPUT 31, 0, 1, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[2] = step1b[2][i] + step1b[13][i]; - ;step1[3] = step1b[3][i] + step1b[12][i]; - ;step1[12] = step1b[3][i] - step1b[12][i]; - ;step1[13] = step1b[2][i] - step1b[13][i]; - LOAD_FROM_OUTPUT 1, 12, 13, q0, q1 - vadd.s16 q2, q10, q1 - vadd.s16 q3, q11, q0 - vsub.s16 q4, q11, q0 - vsub.s16 q5, q10, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[12 * 32] = step1b[12][i] + step1b[19][i]; - ;output[13 * 32] = step1b[13][i] + step1b[18][i]; - ;output[18 * 32] = step1b[13][i] - step1b[18][i]; - ;output[19 * 32] = step1b[12][i] - step1b[19][i]; - LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 19, 18, 19, q6, q7 - STORE_IN_OUTPUT 19, 12, 13, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; - ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; - ;output[28 * 32] = step1b[3][i] - step1b[28][i]; - ;output[29 * 32] = step1b[2][i] - step1b[29][i]; - LOAD_FROM_OUTPUT 13, 28, 29, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 29, 28, 29, q6, q7 - STORE_IN_OUTPUT 29, 2, 3, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[4] = step1b[4][i] + step1b[11][i]; - ;step1[5] = step1b[5][i] + step1b[10][i]; - ;step1[10] = step1b[5][i] - step1b[10][i]; - ;step1[11] = step1b[4][i] - step1b[11][i]; - LOAD_FROM_OUTPUT 3, 10, 11, q0, q1 - vadd.s16 q2, q12, q1 - vadd.s16 q3, q13, q0 - vsub.s16 q4, q13, q0 - vsub.s16 q5, q12, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[10 * 32] = step1b[10][i] + step1b[21][i]; - ;output[11 * 32] = step1b[11][i] + step1b[20][i]; - ;output[20 * 32] = step1b[11][i] - step1b[20][i]; - ;output[21 * 32] = step1b[10][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 21, 20, 21, q6, q7 - STORE_IN_OUTPUT 21, 10, 11, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; - ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; - ;output[26 * 32] = step1b[5][i] - 
step1b[26][i]; - ;output[27 * 32] = step1b[4][i] - step1b[27][i]; - LOAD_FROM_OUTPUT 11, 26, 27, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 27, 26, 27, q6, q7 - STORE_IN_OUTPUT 27, 4, 5, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[6] = step1b[6][i] + step1b[9][i]; - ;step1[7] = step1b[7][i] + step1b[8][i]; - ;step1[8] = step1b[7][i] - step1b[8][i]; - ;step1[9] = step1b[6][i] - step1b[9][i]; - LOAD_FROM_OUTPUT 5, 8, 9, q0, q1 - vadd.s16 q2, q14, q1 - vadd.s16 q3, q15, q0 - vsub.s16 q4, q15, q0 - vsub.s16 q5, q14, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; - ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; - ;output[22 * 32] = step1b[9][i] - step1b[22][i]; - ;output[23 * 32] = step1b[8][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 23, 22, 23, q6, q7 - STORE_IN_OUTPUT 23, 8, 9, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; - ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; - ;output[24 * 32] = step1b[7][i] - step1b[24][i]; - ;output[25 * 32] = step1b[6][i] - step1b[25][i]; - LOAD_FROM_OUTPUT 9, 24, 25, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 25, 24, 25, q6, q7 - STORE_IN_OUTPUT 25, 6, 7, q4, q5 - - ; restore r0 by removing the last offset from the last - ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 - sub r0, r0, #24*8*2 - ; restore r1 by removing the last offset from the last - ; operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2 - ; advance by 8 columns => 8*2 - sub r1, r1, #7*32*2 - 8*2 - ; advance by 8 lines (8*32*2) - ; go back by the two pairs from the loop (32*2) - add r3, r3, #8*32*2 - 32*2 - - ; bands loop processing - subs r4, r4, #1 - bne idct32_bands_loop - - ; parameters for second pass - ; the input of pass2 is the result of pass1. 
we have to remove the offset - ; of 32 columns induced by the above idct32_bands_loop - sub r3, r1, #32*2 - ; r1 = pass2[32 * 32] - add r1, sp, #2048 - - ; pass loop processing - add r5, r5, #1 - b idct32_pass_loop - -idct32_bands_end_2nd_pass - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; - ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; - ;output[30 * 32] = step1b[1][i] - step1b[30][i]; - ;output[31 * 32] = step1b[0][i] - step1b[31][i]; - LOAD_FROM_OUTPUT 17, 30, 31, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[2] = step1b[2][i] + step1b[13][i]; - ;step1[3] = step1b[3][i] + step1b[12][i]; - ;step1[12] = step1b[3][i] - step1b[12][i]; - ;step1[13] = step1b[2][i] - step1b[13][i]; - LOAD_FROM_OUTPUT 31, 12, 13, q0, q1 - vadd.s16 q2, q10, q1 - vadd.s16 q3, q11, q0 - vsub.s16 q4, q11, q0 - vsub.s16 q5, q10, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[12 * 32] = step1b[12][i] + step1b[19][i]; - ;output[13 * 32] = step1b[13][i] + step1b[18][i]; - ;output[18 * 32] = step1b[13][i] - step1b[18][i]; - ;output[19 * 32] = step1b[12][i] - step1b[19][i]; - LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; - ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; - ;output[28 * 32] = step1b[3][i] - step1b[28][i]; - ;output[29 * 32] = step1b[2][i] - step1b[29][i]; - LOAD_FROM_OUTPUT 19, 28, 29, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[4] = step1b[4][i] + step1b[11][i]; - ;step1[5] = step1b[5][i] + step1b[10][i]; - ;step1[10] = step1b[5][i] - step1b[10][i]; - ;step1[11] = step1b[4][i] - step1b[11][i]; - LOAD_FROM_OUTPUT 29, 10, 11, q0, q1 - vadd.s16 q2, q12, q1 - vadd.s16 q3, q13, q0 - vsub.s16 q4, q13, q0 - vsub.s16 q5, q12, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[10 * 32] = step1b[10][i] + step1b[21][i]; - ;output[11 * 32] = step1b[11][i] + step1b[20][i]; - ;output[20 * 32] = step1b[11][i] - step1b[20][i]; - ;output[21 * 32] = step1b[10][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; - ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; - ;output[26 * 32] = step1b[5][i] - step1b[26][i]; - ;output[27 * 32] = step1b[4][i] - step1b[27][i]; - LOAD_FROM_OUTPUT 21, 26, 27, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[6] = step1b[6][i] + 
step1b[9][i]; - ;step1[7] = step1b[7][i] + step1b[8][i]; - ;step1[8] = step1b[7][i] - step1b[8][i]; - ;step1[9] = step1b[6][i] - step1b[9][i]; - LOAD_FROM_OUTPUT 27, 8, 9, q0, q1 - vadd.s16 q2, q14, q1 - vadd.s16 q3, q15, q0 - vsub.s16 q4, q15, q0 - vsub.s16 q5, q14, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; - ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; - ;output[22 * 32] = step1b[9][i] - step1b[22][i]; - ;output[23 * 32] = step1b[8][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS_LAST - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; - ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; - ;output[24 * 32] = step1b[7][i] - step1b[24][i]; - ;output[25 * 32] = step1b[6][i] - step1b[25][i]; - LOAD_FROM_OUTPUT 23, 24, 25, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS_LAST - ; -------------------------------------------------------------------------- - ; restore pointers to their initial indices for next band pass by - ; removing/adding dest_stride * 8. The actual increment by eight - ; is taken care of within the _LAST macros. - add r6, r6, r2, lsl #3 - add r9, r9, r2, lsl #3 - sub r7, r7, r2, lsl #3 - sub r10, r10, r2, lsl #3 - - ; restore r0 by removing the last offset from the last - ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 - sub r0, r0, #24*8*2 - ; restore r1 by removing the last offset from the last - ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2 - ; advance by 8 columns => 8*2 - sub r1, r1, #25*32*2 - 8*2 - ; advance by 8 lines (8*32*2) - ; go back by the two pairs from the loop (32*2) - add r3, r3, #8*32*2 - 32*2 - - ; bands loop processing - subs r4, r4, #1 - bne idct32_bands_loop - - ; stack operation - add sp, sp, #512+2048+2048 - vpop {d8-d15} - pop {r4-r11} - bx lr - ENDP ; |vpx_idct32x32_1024_add_neon| - END diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.c index c4d1e8473c..9f4589ea96 100644 --- a/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct32x32_add_neon.c @@ -11,630 +11,766 @@ #include #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" -#define LOAD_FROM_TRANSPOSED(prev, first, second) \ - q14s16 = vld1q_s16(trans_buf + first * 8); \ - q13s16 = vld1q_s16(trans_buf + second * 8); +static INLINE void load_from_transformed(const int16_t *const trans_buf, + const int first, const int second, + int16x8_t *const q0, + int16x8_t *const q1) { + *q0 = vld1q_s16(trans_buf + first * 8); + *q1 = vld1q_s16(trans_buf + second * 8); +} -#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \ - qA = vld1q_s16(out + first * 32); \ - qB = vld1q_s16(out + second * 32); +static INLINE void load_from_output(const int16_t *const out, const int first, + const int second, int16x8_t *const q0, + int16x8_t *const q1) { + *q0 = vld1q_s16(out + first * 32); + *q1 = vld1q_s16(out + second * 32); +} -#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \ - vst1q_s16(out + first * 32, qA); \ - vst1q_s16(out + second * 32, qB); 
+static INLINE void store_in_output(int16_t *const out, const int first, + const int second, const int16x8_t q0, + const int16x8_t q1) { + vst1q_s16(out + first * 32, q0); + vst1q_s16(out + second * 32, q1); +} -#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \ - __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16); -static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2, - int stride, int16x8_t q6s16, - int16x8_t q7s16, - int16x8_t q8s16, - int16x8_t q9s16) { - int16x4_t d8s16, d9s16, d10s16, d11s16; +static INLINE void store_combine_results(uint8_t *p1, uint8_t *p2, + const int stride, int16x8_t q0, + int16x8_t q1, int16x8_t q2, + int16x8_t q3) { + uint8x8_t d[4]; - d8s16 = vld1_s16((int16_t *)p1); + d[0] = vld1_u8(p1); p1 += stride; - d11s16 = vld1_s16((int16_t *)p2); + d[1] = vld1_u8(p1); + d[3] = vld1_u8(p2); p2 -= stride; - d9s16 = vld1_s16((int16_t *)p1); - d10s16 = vld1_s16((int16_t *)p2); + d[2] = vld1_u8(p2); - q7s16 = vrshrq_n_s16(q7s16, 6); - q8s16 = vrshrq_n_s16(q8s16, 6); - q9s16 = vrshrq_n_s16(q9s16, 6); - q6s16 = vrshrq_n_s16(q6s16, 6); + q0 = vrshrq_n_s16(q0, 6); + q1 = vrshrq_n_s16(q1, 6); + q2 = vrshrq_n_s16(q2, 6); + q3 = vrshrq_n_s16(q3, 6); - q7s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d9s16))); - q8s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s16(d10s16))); - q9s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s16(d11s16))); - q6s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d8s16))); + q0 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q0), d[0])); + q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1), d[1])); + q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2), d[2])); + q3 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q3), d[3])); - d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); - d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16)); - d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16)); - d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); + d[0] = vqmovun_s16(q0); + d[1] = vqmovun_s16(q1); + d[2] = vqmovun_s16(q2); + d[3] = vqmovun_s16(q3); - vst1_s16((int16_t *)p1, d9s16); + vst1_u8(p1, d[1]); p1 -= stride; - vst1_s16((int16_t *)p2, d10s16); + vst1_u8(p1, d[0]); + vst1_u8(p2, d[2]); p2 += stride; - vst1_s16((int16_t *)p1, d8s16); - vst1_s16((int16_t *)p2, d11s16); - return; + vst1_u8(p2, d[3]); } -#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \ - __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16); -static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2, - int stride, int16x8_t q4s16, - int16x8_t q5s16, - int16x8_t q6s16, - int16x8_t q7s16) { - int16x4_t d4s16, d5s16, d6s16, d7s16; +static INLINE void highbd_store_combine_results_bd8(uint16_t *p1, uint16_t *p2, + const int stride, + int16x8_t q0, int16x8_t q1, + int16x8_t q2, + int16x8_t q3) { + uint16x8_t d[4]; - d4s16 = vld1_s16((int16_t *)p1); + d[0] = vld1q_u16(p1); p1 += stride; - d7s16 = vld1_s16((int16_t *)p2); + d[1] = vld1q_u16(p1); + d[3] = vld1q_u16(p2); p2 -= stride; - d5s16 = vld1_s16((int16_t *)p1); - d6s16 = vld1_s16((int16_t *)p2); + d[2] = vld1q_u16(p2); - q5s16 = vrshrq_n_s16(q5s16, 6); - q6s16 = vrshrq_n_s16(q6s16, 6); - q7s16 = vrshrq_n_s16(q7s16, 6); - q4s16 = vrshrq_n_s16(q4s16, 6); + q0 = vrshrq_n_s16(q0, 6); + q1 = vrshrq_n_s16(q1, 6); + q2 = vrshrq_n_s16(q2, 6); + q3 = vrshrq_n_s16(q3, 6); - q5s16 = 
vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16))); - q6s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16))); - q7s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16))); - q4s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16))); + q0 = vaddq_s16(q0, vreinterpretq_s16_u16(d[0])); + q1 = vaddq_s16(q1, vreinterpretq_s16_u16(d[1])); + q2 = vaddq_s16(q2, vreinterpretq_s16_u16(d[2])); + q3 = vaddq_s16(q3, vreinterpretq_s16_u16(d[3])); - d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16)); - d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); - d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); - d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16)); + d[0] = vmovl_u8(vqmovun_s16(q0)); + d[1] = vmovl_u8(vqmovun_s16(q1)); + d[2] = vmovl_u8(vqmovun_s16(q2)); + d[3] = vmovl_u8(vqmovun_s16(q3)); - vst1_s16((int16_t *)p1, d5s16); + vst1q_u16(p1, d[1]); p1 -= stride; - vst1_s16((int16_t *)p2, d6s16); + vst1q_u16(p1, d[0]); + vst1q_u16(p2, d[2]); p2 += stride; - vst1_s16((int16_t *)p2, d7s16); - vst1_s16((int16_t *)p1, d4s16); - return; + vst1q_u16(p2, d[3]); } -#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \ - DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); -static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16, - int16_t first_const, int16_t second_const, - int16x8_t *qAs16, int16x8_t *qBs16) { - int16x4_t d30s16, d31s16; - int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32; - int16x4_t dCs16, dDs16, dAs16, dBs16; +static INLINE void do_butterfly(const int16x8_t qIn0, const int16x8_t qIn1, + const int16_t first_const, + const int16_t second_const, + int16x8_t *const qOut0, + int16x8_t *const qOut1) { + int32x4_t q[4]; + int16x4_t d[6]; - dCs16 = vget_low_s16(q14s16); - dDs16 = vget_high_s16(q14s16); - dAs16 = vget_low_s16(q13s16); - dBs16 = vget_high_s16(q13s16); + d[0] = vget_low_s16(qIn0); + d[1] = vget_high_s16(qIn0); + d[2] = vget_low_s16(qIn1); + d[3] = vget_high_s16(qIn1); - d30s16 = vdup_n_s16(first_const); - d31s16 = vdup_n_s16(second_const); + // Note: using v{mul, mla, mls}l_n_s16 here slows down 35% with gcc 4.9. 
+ d[4] = vdup_n_s16(first_const); + d[5] = vdup_n_s16(second_const); - q8s32 = vmull_s16(dCs16, d30s16); - q10s32 = vmull_s16(dAs16, d31s16); - q9s32 = vmull_s16(dDs16, d30s16); - q11s32 = vmull_s16(dBs16, d31s16); - q12s32 = vmull_s16(dCs16, d31s16); + q[0] = vmull_s16(d[0], d[4]); + q[1] = vmull_s16(d[1], d[4]); + q[0] = vmlsl_s16(q[0], d[2], d[5]); + q[1] = vmlsl_s16(q[1], d[3], d[5]); - q8s32 = vsubq_s32(q8s32, q10s32); - q9s32 = vsubq_s32(q9s32, q11s32); + q[2] = vmull_s16(d[0], d[5]); + q[3] = vmull_s16(d[1], d[5]); + q[2] = vmlal_s16(q[2], d[2], d[4]); + q[3] = vmlal_s16(q[3], d[3], d[4]); - q10s32 = vmull_s16(dDs16, d31s16); - q11s32 = vmull_s16(dAs16, d30s16); - q15s32 = vmull_s16(dBs16, d30s16); - - q11s32 = vaddq_s32(q12s32, q11s32); - q10s32 = vaddq_s32(q10s32, q15s32); - - *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14)); - *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14)); - return; + *qOut0 = vcombine_s16(vrshrn_n_s32(q[0], DCT_CONST_BITS), + vrshrn_n_s32(q[1], DCT_CONST_BITS)); + *qOut1 = vcombine_s16(vrshrn_n_s32(q[2], DCT_CONST_BITS), + vrshrn_n_s32(q[3], DCT_CONST_BITS)); } -static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) { - int16_t *in; +static INLINE void load_s16x8q(const int16_t *in, int16x8_t *const s0, + int16x8_t *const s1, int16x8_t *const s2, + int16x8_t *const s3, int16x8_t *const s4, + int16x8_t *const s5, int16x8_t *const s6, + int16x8_t *const s7) { + *s0 = vld1q_s16(in); + in += 32; + *s1 = vld1q_s16(in); + in += 32; + *s2 = vld1q_s16(in); + in += 32; + *s3 = vld1q_s16(in); + in += 32; + *s4 = vld1q_s16(in); + in += 32; + *s5 = vld1q_s16(in); + in += 32; + *s6 = vld1q_s16(in); + in += 32; + *s7 = vld1q_s16(in); +} + +static INLINE void transpose_and_store_s16_8x8(int16x8_t a0, int16x8_t a1, + int16x8_t a2, int16x8_t a3, + int16x8_t a4, int16x8_t a5, + int16x8_t a6, int16x8_t a7, + int16_t **out) { + transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + vst1q_s16(*out, a0); + *out += 8; + vst1q_s16(*out, a1); + *out += 8; + vst1q_s16(*out, a2); + *out += 8; + vst1q_s16(*out, a3); + *out += 8; + vst1q_s16(*out, a4); + *out += 8; + vst1q_s16(*out, a5); + *out += 8; + vst1q_s16(*out, a6); + *out += 8; + vst1q_s16(*out, a7); + *out += 8; +} + +static INLINE void idct32_transpose_pair(const int16_t *input, int16_t *t_buf) { int i; - const int stride = 32; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; for (i = 0; i < 4; i++, input += 8) { - in = input; - q8s16 = vld1q_s16(in); - in += stride; - q9s16 = vld1q_s16(in); - in += stride; - q10s16 = vld1q_s16(in); - in += stride; - q11s16 = vld1q_s16(in); - in += stride; - q12s16 = vld1q_s16(in); - in += stride; - q13s16 = vld1q_s16(in); - in += stride; - q14s16 = vld1q_s16(in); - in += stride; - q15s16 = vld1q_s16(in); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - - vst1q_s16(t_buf, q8s16); - t_buf += 8; - vst1q_s16(t_buf, q9s16); - t_buf += 8; - vst1q_s16(t_buf, q10s16); - t_buf += 8; - vst1q_s16(t_buf, q11s16); - t_buf += 8; - vst1q_s16(t_buf, q12s16); - t_buf += 8; - vst1q_s16(t_buf, q13s16); - t_buf += 8; - vst1q_s16(t_buf, q14s16); - t_buf += 8; - vst1q_s16(t_buf, q15s16); - t_buf += 8; + load_s16x8q(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf); } - return; } -static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16, - 
int16x8_t q3s16, int16x8_t q6s16, - int16x8_t q7s16, int16x8_t q8s16, - int16x8_t q9s16, int16x8_t q10s16, - int16x8_t q11s16, int16x8_t q12s16, - int16x8_t q13s16, int16x8_t q14s16, - int16x8_t q15s16) { - int16x8_t q0s16, q1s16, q4s16, q5s16; - - STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16); - STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16); - - LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16); - STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16); - - LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16); - q2s16 = vaddq_s16(q10s16, q1s16); - q3s16 = vaddq_s16(q11s16, q0s16); - q4s16 = vsubq_s16(q11s16, q0s16); - q5s16 = vsubq_s16(q10s16, q1s16); - - LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16); - STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16); - - LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16); - STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16); - - LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16); - q2s16 = vaddq_s16(q12s16, q1s16); - q3s16 = vaddq_s16(q13s16, q0s16); - q4s16 = vsubq_s16(q13s16, q0s16); - q5s16 = vsubq_s16(q12s16, q1s16); - - LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16); - STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16); - - LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16); - STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16); - - LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16); - q2s16 = vaddq_s16(q14s16, q1s16); - q3s16 = vaddq_s16(q15s16, q0s16); - q4s16 = vsubq_s16(q15s16, q0s16); - q5s16 = vsubq_s16(q14s16, q1s16); - - LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16); - STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16); - - LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16); - STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16); - return; +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void load_s16x8q_tran_low( + const tran_low_t *in, int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, int16x8_t *const s4, + int16x8_t *const s5, int16x8_t *const s6, int16x8_t *const s7) { + *s0 = load_tran_low_to_s16q(in); + in += 32; + *s1 = load_tran_low_to_s16q(in); + in += 32; + *s2 = load_tran_low_to_s16q(in); + in += 32; + *s3 = load_tran_low_to_s16q(in); + in += 32; + *s4 = load_tran_low_to_s16q(in); + in += 32; + *s5 = load_tran_low_to_s16q(in); + in += 32; + *s6 = load_tran_low_to_s16q(in); + in += 32; + *s7 = load_tran_low_to_s16q(in); } -static INLINE void idct32_bands_end_2nd_pass( - int16_t *out, 
uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16, - int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16, - int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16, - int16x8_t q14s16, int16x8_t q15s16) { - uint8_t *r6 = dest + 31 * stride; - uint8_t *r7 = dest /* + 0 * stride*/; - uint8_t *r9 = dest + 15 * stride; - uint8_t *r10 = dest + 16 * stride; - int str2 = stride << 1; - int16x8_t q0s16, q1s16, q4s16, q5s16; +static INLINE void idct32_transpose_pair_tran_low(const tran_low_t *input, + int16_t *t_buf) { + int i; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; - r9 -= str2; + for (i = 0; i < 4; i++, input += 8) { + load_s16x8q_tran_low(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf); + } +} +#else // !CONFIG_VP9_HIGHBITDEPTH +#define idct32_transpose_pair_tran_low idct32_transpose_pair +#endif // CONFIG_VP9_HIGHBITDEPTH - LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; - r6 -= str2; +static INLINE void idct32_bands_end_1st_pass(int16_t *const out, + int16x8_t *const q) { + store_in_output(out, 16, 17, q[6], q[7]); + store_in_output(out, 14, 15, q[8], q[9]); - LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16) - q2s16 = vaddq_s16(q10s16, q1s16); - q3s16 = vaddq_s16(q11s16, q0s16); - q4s16 = vsubq_s16(q11s16, q0s16); - q5s16 = vsubq_s16(q10s16, q1s16); + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = vaddq_s16(q[2], q[1]); + q[5] = vaddq_s16(q[3], q[0]); + q[6] = vsubq_s16(q[3], q[0]); + q[7] = vsubq_s16(q[2], q[1]); + store_in_output(out, 30, 31, q[6], q[7]); + store_in_output(out, 0, 1, q[4], q[5]); - LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; - r9 -= str2; + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = vaddq_s16(q[10], q[1]); + q[3] = vaddq_s16(q[11], q[0]); + q[4] = vsubq_s16(q[11], q[0]); + q[5] = vsubq_s16(q[10], q[1]); - LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; - r6 -= str2; + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = vaddq_s16(q[4], q[1]); + q[9] = vaddq_s16(q[5], q[0]); + q[6] = vsubq_s16(q[5], q[0]); + q[7] = vsubq_s16(q[4], q[1]); + store_in_output(out, 18, 19, q[6], q[7]); + store_in_output(out, 12, 13, q[8], q[9]); - LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16) - q2s16 = vaddq_s16(q12s16, q1s16); - q3s16 = vaddq_s16(q13s16, q0s16); - q4s16 = vsubq_s16(q13s16, q0s16); - q5s16 = vsubq_s16(q12s16, q1s16); + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = vaddq_s16(q[2], q[1]); + q[5] = vaddq_s16(q[3], q[0]); + q[6] = vsubq_s16(q[3], q[0]); + q[7] = vsubq_s16(q[2], q[1]); + store_in_output(out, 28, 29, q[6], q[7]); + store_in_output(out, 2, 3, q[4], q[5]); - LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; - r9 -= str2; + 
load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = vaddq_s16(q[12], q[1]); + q[3] = vaddq_s16(q[13], q[0]); + q[4] = vsubq_s16(q[13], q[0]); + q[5] = vsubq_s16(q[12], q[1]); - LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; - r6 -= str2; + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = vaddq_s16(q[4], q[1]); + q[9] = vaddq_s16(q[5], q[0]); + q[6] = vsubq_s16(q[5], q[0]); + q[7] = vsubq_s16(q[4], q[1]); + store_in_output(out, 20, 21, q[6], q[7]); + store_in_output(out, 10, 11, q[8], q[9]); - LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16) - q2s16 = vaddq_s16(q14s16, q1s16); - q3s16 = vaddq_s16(q15s16, q0s16); - q4s16 = vsubq_s16(q15s16, q0s16); - q5s16 = vsubq_s16(q14s16, q1s16); + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = vaddq_s16(q[2], q[1]); + q[5] = vaddq_s16(q[3], q[0]); + q[6] = vsubq_s16(q[3], q[0]); + q[7] = vsubq_s16(q[2], q[1]); + store_in_output(out, 26, 27, q[6], q[7]); + store_in_output(out, 4, 5, q[4], q[5]); - LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = vaddq_s16(q[14], q[1]); + q[3] = vaddq_s16(q[15], q[0]); + q[4] = vsubq_s16(q[15], q[0]); + q[5] = vsubq_s16(q[14], q[1]); - LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - return; + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = vaddq_s16(q[4], q[1]); + q[9] = vaddq_s16(q[5], q[0]); + q[6] = vsubq_s16(q[5], q[0]); + q[7] = vsubq_s16(q[4], q[1]); + store_in_output(out, 22, 23, q[6], q[7]); + store_in_output(out, 8, 9, q[8], q[9]); + + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = vaddq_s16(q[2], q[1]); + q[5] = vaddq_s16(q[3], q[0]); + q[6] = vsubq_s16(q[3], q[0]); + q[7] = vsubq_s16(q[2], q[1]); + store_in_output(out, 24, 25, q[6], q[7]); + store_in_output(out, 6, 7, q[4], q[5]); } -void vpx_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) { +static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out, + uint8_t *const dest, + const int stride, + int16x8_t *const q) { + uint8_t *dest0 = dest + 0 * stride; + uint8_t *dest1 = dest + 31 * stride; + uint8_t *dest2 = dest + 16 * stride; + uint8_t *dest3 = dest + 15 * stride; + const int str2 = stride << 1; + + store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = vaddq_s16(q[10], q[1]); + q[3] = vaddq_s16(q[11], q[0]); + q[4] = vsubq_s16(q[11], q[0]); + q[5] = vsubq_s16(q[10], q[1]); + + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]); + dest2 += str2; + dest3 
-= str2; + + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = vaddq_s16(q[12], q[1]); + q[3] = vaddq_s16(q[13], q[0]); + q[4] = vsubq_s16(q[13], q[0]); + q[5] = vsubq_s16(q[12], q[1]); + + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = vaddq_s16(q[14], q[1]); + q[3] = vaddq_s16(q[15], q[0]); + q[4] = vsubq_s16(q[15], q[0]); + q[5] = vsubq_s16(q[14], q[1]); + + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]); + + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]); +} + +static INLINE void highbd_idct32_bands_end_2nd_pass_bd8( + const int16_t *const out, uint16_t *const dest, const int stride, + int16x8_t *const q) { + uint16_t *dest0 = dest + 0 * stride; + uint16_t *dest1 = dest + 31 * stride; + uint16_t *dest2 = dest + 16 * stride; + uint16_t *dest3 = dest + 15 * stride; + const int str2 = stride << 1; + + highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8], + q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6], + q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = vaddq_s16(q[10], q[1]); + q[3] = vaddq_s16(q[11], q[0]); + q[4] = vsubq_s16(q[11], q[0]); + q[5] = vsubq_s16(q[10], q[1]); + + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8], + q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6], + q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = vaddq_s16(q[12], q[1]); + q[3] = vaddq_s16(q[13], q[0]); + q[4] = vsubq_s16(q[13], q[0]); + q[5] = vsubq_s16(q[12], q[1]); + + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = 
final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8], + q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6], + q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = vaddq_s16(q[14], q[1]); + q[3] = vaddq_s16(q[15], q[0]); + q[4] = vsubq_s16(q[15], q[0]); + q[5] = vsubq_s16(q[14], q[1]); + + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8], + q[9]); + + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6], + q[7]); +} + +void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest, + const int stride, const int highbd_flag) { int i, idct32_pass_loop; int16_t trans_buf[32 * 8]; int16_t pass1[32 * 32]; int16_t pass2[32 * 32]; + const int16_t *input_pass2 = pass1; // input of pass2 is the result of pass1 int16_t *out; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int16x8_t q[16]; + uint16_t *dst = CAST_TO_SHORTPTR(dest); for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; - idct32_pass_loop++, - input = pass1, // the input of pass2 is the result of pass1 - out = pass2) { - for (i = 0; i < 4; i++, input += 32 * 8, out += 8) { // idct32_bands_loop - idct32_transpose_pair(input, trans_buf); + idct32_pass_loop++, out = pass2) { + for (i = 0; i < 4; i++, out += 8) { // idct32_bands_loop + if (idct32_pass_loop == 0) { + idct32_transpose_pair_tran_low(input, trans_buf); + input += 32 * 8; + } else { + idct32_transpose_pair(input_pass2, trans_buf); + input_pass2 += 32 * 8; + } // ----------------------------------------- // BLOCK A: 16-19,28-31 // ----------------------------------------- // generate 16,17,30,31 // part of stage 1 - LOAD_FROM_TRANSPOSED(0, 1, 31) - DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(31, 17, 15) - DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16) + load_from_transformed(trans_buf, 1, 31, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_31_64, cospi_1_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 17, 15, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_15_64, cospi_17_64, &q[1], &q[3]); // part of stage 2 - q4s16 = vaddq_s16(q0s16, q1s16); - q13s16 = vsubq_s16(q0s16, q1s16); - q6s16 = vaddq_s16(q2s16, q3s16); - q14s16 = vsubq_s16(q2s16, q3s16); + q[4] = vaddq_s16(q[0], q[1]); + q[13] = vsubq_s16(q[0], q[1]); + q[6] = vaddq_s16(q[2], q[3]); + q[14] = vsubq_s16(q[2], q[3]); // part of stage 3 - DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16) + do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[5], &q[7]); // generate 18,19,28,29 // part of stage 1 - LOAD_FROM_TRANSPOSED(15, 9, 23) - DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(23, 25, 7) - DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16) + 
load_from_transformed(trans_buf, 9, 23, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_23_64, cospi_9_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 25, 7, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_7_64, cospi_25_64, &q[1], &q[3]); // part of stage 2 - q13s16 = vsubq_s16(q3s16, q2s16); - q3s16 = vaddq_s16(q3s16, q2s16); - q14s16 = vsubq_s16(q1s16, q0s16); - q2s16 = vaddq_s16(q1s16, q0s16); + q[13] = vsubq_s16(q[3], q[2]); + q[3] = vaddq_s16(q[3], q[2]); + q[14] = vsubq_s16(q[1], q[0]); + q[2] = vaddq_s16(q[1], q[0]); // part of stage 3 - DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16) + do_butterfly(q[14], q[13], -cospi_4_64, -cospi_28_64, &q[1], &q[0]); // part of stage 4 - q8s16 = vaddq_s16(q4s16, q2s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q10s16 = vaddq_s16(q7s16, q1s16); - q15s16 = vaddq_s16(q6s16, q3s16); - q13s16 = vsubq_s16(q5s16, q0s16); - q14s16 = vsubq_s16(q7s16, q1s16); - STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16) - STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16) + q[8] = vaddq_s16(q[4], q[2]); + q[9] = vaddq_s16(q[5], q[0]); + q[10] = vaddq_s16(q[7], q[1]); + q[15] = vaddq_s16(q[6], q[3]); + q[13] = vsubq_s16(q[5], q[0]); + q[14] = vsubq_s16(q[7], q[1]); + store_in_output(out, 16, 31, q[8], q[15]); + store_in_output(out, 17, 30, q[9], q[10]); // part of stage 5 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16) - STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16) + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[0], &q[1]); + store_in_output(out, 29, 18, q[1], q[0]); // part of stage 4 - q13s16 = vsubq_s16(q4s16, q2s16); - q14s16 = vsubq_s16(q6s16, q3s16); + q[13] = vsubq_s16(q[4], q[2]); + q[14] = vsubq_s16(q[6], q[3]); // part of stage 5 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16) - STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16) + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[4], &q[6]); + store_in_output(out, 19, 28, q[4], q[6]); // ----------------------------------------- // BLOCK B: 20-23,24-27 // ----------------------------------------- // generate 20,21,26,27 // part of stage 1 - LOAD_FROM_TRANSPOSED(7, 5, 27) - DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(27, 21, 11) - DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16) + load_from_transformed(trans_buf, 5, 27, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_27_64, cospi_5_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 21, 11, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_11_64, cospi_21_64, &q[1], &q[3]); // part of stage 2 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); + q[13] = vsubq_s16(q[0], q[1]); + q[0] = vaddq_s16(q[0], q[1]); + q[14] = vsubq_s16(q[2], q[3]); + q[2] = vaddq_s16(q[2], q[3]); // part of stage 3 - DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]); // generate 22,23,24,25 // part of stage 1 - LOAD_FROM_TRANSPOSED(11, 13, 19) - DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(19, 29, 3) - DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16) + load_from_transformed(trans_buf, 13, 19, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_19_64, cospi_13_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 29, 3, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_3_64, cospi_29_64, &q[4], &q[6]); // part of stage 2 - q14s16 = vsubq_s16(q4s16, q5s16); - q5s16 = vaddq_s16(q4s16, 
q5s16); - q13s16 = vsubq_s16(q6s16, q7s16); - q6s16 = vaddq_s16(q6s16, q7s16); + q[14] = vsubq_s16(q[4], q[5]); + q[5] = vaddq_s16(q[4], q[5]); + q[13] = vsubq_s16(q[6], q[7]); + q[6] = vaddq_s16(q[6], q[7]); // part of stage 3 - DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16) + do_butterfly(q[14], q[13], -cospi_20_64, -cospi_12_64, &q[4], &q[7]); // part of stage 4 - q10s16 = vaddq_s16(q7s16, q1s16); - q11s16 = vaddq_s16(q5s16, q0s16); - q12s16 = vaddq_s16(q6s16, q2s16); - q15s16 = vaddq_s16(q4s16, q3s16); + q[10] = vaddq_s16(q[7], q[1]); + q[11] = vaddq_s16(q[5], q[0]); + q[12] = vaddq_s16(q[6], q[2]); + q[15] = vaddq_s16(q[4], q[3]); // part of stage 6 - LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16) - q8s16 = vaddq_s16(q14s16, q11s16); - q9s16 = vaddq_s16(q13s16, q10s16); - q13s16 = vsubq_s16(q13s16, q10s16); - q11s16 = vsubq_s16(q14s16, q11s16); - STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16) - LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16) - q8s16 = vsubq_s16(q9s16, q12s16); - q10s16 = vaddq_s16(q14s16, q15s16); - q14s16 = vsubq_s16(q14s16, q15s16); - q12s16 = vaddq_s16(q9s16, q12s16); - STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16) + load_from_output(out, 16, 17, &q[14], &q[13]); + q[8] = vaddq_s16(q[14], q[11]); + q[9] = vaddq_s16(q[13], q[10]); + q[13] = vsubq_s16(q[13], q[10]); + q[11] = vsubq_s16(q[14], q[11]); + store_in_output(out, 17, 16, q[9], q[8]); + load_from_output(out, 30, 31, &q[14], &q[9]); + q[8] = vsubq_s16(q[9], q[12]); + q[10] = vaddq_s16(q[14], q[15]); + q[14] = vsubq_s16(q[14], q[15]); + q[12] = vaddq_s16(q[9], q[12]); + store_in_output(out, 30, 31, q[10], q[12]); // part of stage 7 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16) - q13s16 = q11s16; - q14s16 = q8s16; - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16) + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 25, 22, q[14], q[13]); + do_butterfly(q[8], q[11], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 24, 23, q[14], q[13]); // part of stage 4 - q14s16 = vsubq_s16(q5s16, q0s16); - q13s16 = vsubq_s16(q6s16, q2s16); - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16); - q14s16 = vsubq_s16(q7s16, q1s16); - q13s16 = vsubq_s16(q4s16, q3s16); - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16); + q[14] = vsubq_s16(q[5], q[0]); + q[13] = vsubq_s16(q[6], q[2]); + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[5], &q[6]); + q[14] = vsubq_s16(q[7], q[1]); + q[13] = vsubq_s16(q[4], q[3]); + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[0], &q[1]); // part of stage 6 - LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16) - q8s16 = vaddq_s16(q14s16, q1s16); - q9s16 = vaddq_s16(q13s16, q6s16); - q13s16 = vsubq_s16(q13s16, q6s16); - q1s16 = vsubq_s16(q14s16, q1s16); - STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16) - LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16) - q14s16 = vsubq_s16(q8s16, q5s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q9s16, q0s16); - q0s16 = vsubq_s16(q9s16, q0s16); - STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16) + load_from_output(out, 18, 19, &q[14], &q[13]); + q[8] = vaddq_s16(q[14], q[1]); + q[9] = vaddq_s16(q[13], q[6]); + q[13] = vsubq_s16(q[13], q[6]); + q[1] = vsubq_s16(q[14], q[1]); + store_in_output(out, 18, 19, q[8], q[9]); + load_from_output(out, 28, 29, &q[8], &q[9]); + q[14] = vsubq_s16(q[8], q[5]); + q[10] = vaddq_s16(q[8], q[5]); + q[11] = vaddq_s16(q[9], q[0]); 
+ q[0] = vsubq_s16(q[9], q[0]); + store_in_output(out, 28, 29, q[10], q[11]); // part of stage 7 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16) - DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16); - STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16) + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 20, 27, q[13], q[14]); + do_butterfly(q[0], q[1], cospi_16_64, cospi_16_64, &q[1], &q[0]); + store_in_output(out, 21, 26, q[1], q[0]); // ----------------------------------------- // BLOCK C: 8-10,11-15 // ----------------------------------------- // generate 8,9,14,15 // part of stage 2 - LOAD_FROM_TRANSPOSED(3, 2, 30) - DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(30, 18, 14) - DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16) + load_from_transformed(trans_buf, 2, 30, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_30_64, cospi_2_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 18, 14, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_14_64, cospi_18_64, &q[1], &q[3]); // part of stage 3 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); + q[13] = vsubq_s16(q[0], q[1]); + q[0] = vaddq_s16(q[0], q[1]); + q[14] = vsubq_s16(q[2], q[3]); + q[2] = vaddq_s16(q[2], q[3]); // part of stage 4 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16) + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[1], &q[3]); // generate 10,11,12,13 // part of stage 2 - LOAD_FROM_TRANSPOSED(14, 10, 22) - DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(22, 26, 6) - DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16) + load_from_transformed(trans_buf, 10, 22, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_22_64, cospi_10_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 26, 6, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_6_64, cospi_26_64, &q[4], &q[6]); // part of stage 3 - q14s16 = vsubq_s16(q4s16, q5s16); - q5s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q6s16, q7s16); - q6s16 = vaddq_s16(q6s16, q7s16); + q[14] = vsubq_s16(q[4], q[5]); + q[5] = vaddq_s16(q[4], q[5]); + q[13] = vsubq_s16(q[6], q[7]); + q[6] = vaddq_s16(q[6], q[7]); // part of stage 4 - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16) + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[4], &q[7]); // part of stage 5 - q8s16 = vaddq_s16(q0s16, q5s16); - q9s16 = vaddq_s16(q1s16, q7s16); - q13s16 = vsubq_s16(q1s16, q7s16); - q14s16 = vsubq_s16(q3s16, q4s16); - q10s16 = vaddq_s16(q3s16, q4s16); - q15s16 = vaddq_s16(q2s16, q6s16); - STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16) - STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16) + q[8] = vaddq_s16(q[0], q[5]); + q[9] = vaddq_s16(q[1], q[7]); + q[13] = vsubq_s16(q[1], q[7]); + q[14] = vsubq_s16(q[3], q[4]); + q[10] = vaddq_s16(q[3], q[4]); + q[15] = vaddq_s16(q[2], q[6]); + store_in_output(out, 8, 15, q[8], q[15]); + store_in_output(out, 9, 14, q[9], q[10]); // part of stage 6 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16) - q13s16 = vsubq_s16(q0s16, q5s16); - q14s16 = vsubq_s16(q2s16, q6s16); - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16) + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + store_in_output(out, 13, 10, q[3], q[1]); + q[13] 
= vsubq_s16(q[0], q[5]); + q[14] = vsubq_s16(q[2], q[6]); + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + store_in_output(out, 11, 12, q[1], q[3]); // ----------------------------------------- // BLOCK D: 0-3,4-7 // ----------------------------------------- // generate 4,5,6,7 // part of stage 3 - LOAD_FROM_TRANSPOSED(6, 4, 28) - DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(28, 20, 12) - DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + load_from_transformed(trans_buf, 4, 28, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 20, 12, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]); // part of stage 4 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); + q[13] = vsubq_s16(q[0], q[1]); + q[0] = vaddq_s16(q[0], q[1]); + q[14] = vsubq_s16(q[2], q[3]); + q[2] = vaddq_s16(q[2], q[3]); // part of stage 5 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); // generate 0,1,2,3 // part of stage 4 - LOAD_FROM_TRANSPOSED(12, 0, 16) - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(16, 8, 24) - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16) + load_from_transformed(trans_buf, 0, 16, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 8, 24, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[14], &q[6]); // part of stage 5 - q4s16 = vaddq_s16(q7s16, q6s16); - q7s16 = vsubq_s16(q7s16, q6s16); - q6s16 = vsubq_s16(q5s16, q14s16); - q5s16 = vaddq_s16(q5s16, q14s16); + q[4] = vaddq_s16(q[7], q[6]); + q[7] = vsubq_s16(q[7], q[6]); + q[6] = vsubq_s16(q[5], q[14]); + q[5] = vaddq_s16(q[5], q[14]); // part of stage 6 - q8s16 = vaddq_s16(q4s16, q2s16); - q9s16 = vaddq_s16(q5s16, q3s16); - q10s16 = vaddq_s16(q6s16, q1s16); - q11s16 = vaddq_s16(q7s16, q0s16); - q12s16 = vsubq_s16(q7s16, q0s16); - q13s16 = vsubq_s16(q6s16, q1s16); - q14s16 = vsubq_s16(q5s16, q3s16); - q15s16 = vsubq_s16(q4s16, q2s16); + q[8] = vaddq_s16(q[4], q[2]); + q[9] = vaddq_s16(q[5], q[3]); + q[10] = vaddq_s16(q[6], q[1]); + q[11] = vaddq_s16(q[7], q[0]); + q[12] = vsubq_s16(q[7], q[0]); + q[13] = vsubq_s16(q[6], q[1]); + q[14] = vsubq_s16(q[5], q[3]); + q[15] = vsubq_s16(q[4], q[2]); // part of stage 7 - LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16) - q2s16 = vaddq_s16(q8s16, q1s16); - q3s16 = vaddq_s16(q9s16, q0s16); - q4s16 = vsubq_s16(q9s16, q0s16); - q5s16 = vsubq_s16(q8s16, q1s16); - LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); + load_from_output(out, 14, 15, &q[0], &q[1]); + q[2] = vaddq_s16(q[8], q[1]); + q[3] = vaddq_s16(q[9], q[0]); + q[4] = vsubq_s16(q[9], q[0]); + q[5] = vsubq_s16(q[8], q[1]); + load_from_output(out, 16, 17, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); if (idct32_pass_loop == 0) { - idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, - q10s16, q11s16, q12s16, q13s16, q14s16, - q15s16); + idct32_bands_end_1st_pass(out, q); } else { - idct32_bands_end_2nd_pass(out, dest, stride, q2s16, 
q3s16, q6s16, q7s16, - q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, - q14s16, q15s16); - dest += 8; + if (highbd_flag) { + highbd_idct32_bands_end_2nd_pass_bd8(out, dst, stride, q); + dst += 8; + } else { + idct32_bands_end_2nd_pass(out, dest, stride, q); + dest += 8; + } } } } - return; +} + +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + vpx_idct32_32_neon(input, dest, stride, 0); } diff --git a/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm index adab715dde..d83421e9e6 100644 --- a/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm +++ b/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm @@ -15,19 +15,17 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) +;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input ; r1 uint8_t *dest -; r2 int dest_stride) +; r2 int stride) |vpx_idct4x4_1_add_neon| PROC ldrsh r0, [r0] - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 + ; cospi_16_64 = 11585 + movw r12, #0x2d41 ; out = dct_const_round_shift(input[0] * cospi_16_64) mul r0, r0, r12 ; input[0] * cospi_16_64 diff --git a/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c index 9f999e979d..a14b895431 100644 --- a/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c @@ -9,38 +9,39 @@ */ #include +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x8_t d6u8; - uint32x2_t d2u32 = vdup_n_u32(0); - uint16x8_t q8u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 4); - - q0s16 = vdupq_n_s16(a1); - - // dc_only_idct_add - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); - d1 += dest_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32)); - d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); - d2 += dest_stride; - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); - d2 += dest_stride; - } - return; +static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride, + const int16x8_t res, + uint32x2_t *const d) { + uint16x8_t a; + uint8x8_t b; + *d = vld1_lane_u32((const uint32_t *)*dest, *d, 0); + *d = vld1_lane_u32((const uint32_t *)(*dest + stride), *d, 1); + a = vaddw_u8(vreinterpretq_u16_s16(res), vreinterpret_u8_u32(*d)); + b = vqmovun_s16(vreinterpretq_s16_u16(a)); + vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 0); + *dest += stride; + vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 1); + *dest += stride; +} + +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); + const int16x8_t dc = vdupq_n_s16(a1); + uint32x2_t d = vdup_n_u32(0); + + assert(!((intptr_t)dest % 
sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); + + idct4x4_1_add_kernel(&dest, stride, dc, &d); + idct4x4_1_add_kernel(&dest, stride, dc, &d); } diff --git a/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm index 877fbd6343..184d218941 100644 --- a/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm +++ b/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm @@ -15,12 +15,14 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 + INCLUDE vpx_dsp/arm/idct_neon.asm.S + AREA Block, CODE, READONLY ; name this block of code -;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input ; r1 uint8_t *dest -; r2 int dest_stride) +; r2 int stride) |vpx_idct4x4_16_add_neon| PROC @@ -33,18 +35,15 @@ ; So, two passes of a transpose followed by a column transform. ; load the inputs into q8-q9, d16-d19 - vld1.s16 {q8,q9}, [r0]! + LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 ; generate scalar constants - ; cospi_8_64 = 15137 = 0x3b21 - mov r0, #0x3b00 - add r0, #0x21 - ; cospi_16_64 = 11585 = 0x2d41 - mov r3, #0x2d00 - add r3, #0x41 - ; cospi_24_64 = 6270 = 0x 187e - mov r12, #0x1800 - add r12, #0x7e + ; cospi_8_64 = 15137 + movw r0, #0x3b21 + ; cospi_16_64 = 11585 + movw r3, #0x2d41 + ; cospi_24_64 = 6270 + movw r12, #0x187e ; transpose the input data ; 00 01 02 03 d16 @@ -73,16 +72,15 @@ ; do the transform on transposed rows ; stage 1 - vadd.s16 d23, d16, d18 ; (input[0] + input[2]) - vsub.s16 d24, d16, d18 ; (input[0] - input[2]) - vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 ; (input[0] + input[2]) * cospi_16_64; ; (input[0] - input[2]) * cospi_16_64; - vmull.s16 q13, d23, d21 - vmull.s16 q14, d24, d21 + vmull.s16 q8, d16, d21 + vmull.s16 q14, d18, d21 + vadd.s32 q13, q8, q14 + vsub.s32 q14, q8, q14 ; input[1] * cospi_24_64 - input[3] * cospi_8_64; ; input[1] * cospi_8_64 + input[3] * cospi_24_64; @@ -90,10 +88,10 @@ vmlal.s16 q1, d19, d22 ; dct_const_round_shift - vqrshrn.s32 d26, q13, #14 - vqrshrn.s32 d27, q14, #14 - vqrshrn.s32 d29, q15, #14 - vqrshrn.s32 d28, q1, #14 + vrshrn.s32 d26, q13, #14 + vrshrn.s32 d27, q14, #14 + vrshrn.s32 d29, q15, #14 + vrshrn.s32 d28, q1, #14 ; stage 2 ; output[0] = step[0] + step[3]; @@ -141,10 +139,10 @@ vmlal.s16 q1, d19, d22 ; dct_const_round_shift - vqrshrn.s32 d26, q13, #14 - vqrshrn.s32 d27, q14, #14 - vqrshrn.s32 d29, q15, #14 - vqrshrn.s32 d28, q1, #14 + vrshrn.s32 d26, q13, #14 + vrshrn.s32 d27, q14, #14 + vrshrn.s32 d29, q15, #14 + vrshrn.s32 d28, q1, #14 ; stage 2 ; output[0] = step[0] + step[3]; @@ -169,7 +167,7 @@ vld1.32 {d27[1]}, [r1], r2 vld1.32 {d27[0]}, [r1] ; no post-increment - ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i] vaddw.u8 q8, q8, d26 vaddw.u8 q9, q9, d27 diff --git a/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c index 382626928b..673a36840e 100644 --- a/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c @@ -9,138 +9,54 @@ */ #include +#include -void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x8_t d26u8, d27u8; - uint32x2_t d26u32, d27u32; - uint16x8_t q8u16, q9u16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16; - int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16; - int16x8_t q8s16, q9s16, q13s16, q14s16; - int32x4_t q1s32, 
q13s32, q14s32, q15s32; - int16x4x2_t d0x2s16, d1x2s16; - int32x4x2_t q0x2s32; - uint8_t *d; - int16_t cospi_8_64 = 15137; - int16_t cospi_16_64 = 11585; - int16_t cospi_24_64 = 6270; +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/txfm_common.h" - d26u32 = d27u32 = vdup_n_u32(0); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const uint8_t *dst = dest; + const int16x4_t cospis = vld1_s16(kCospi); + uint8x8_t dest01_u8; + uint32x2_t dest32_u32 = vdup_n_u32(0); + int16x8_t a0, a1; + uint8x8_t d01, d32; + uint16x8_t d01_u16, d32_u16; - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); + assert(!((intptr_t)dest % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); + // Rows + a0 = load_tran_low_to_s16q(input); + a1 = load_tran_low_to_s16q(input + 8); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); + // Columns + a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a0 = vrshrq_n_s16(a0, 4); + a1 = vrshrq_n_s16(a1, 4); - d20s16 = vdup_n_s16(cospi_8_64); - d21s16 = vdup_n_s16(cospi_16_64); + dest01_u8 = load_u8(dst, stride); + dst += 2 * stride; + // The elements are loaded in reverse order. + dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 1); + dst += stride; + dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 0); - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + d01_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), dest01_u8); + d32_u16 = + vaddw_u8(vreinterpretq_u16_s16(a1), vreinterpret_u8_u32(dest32_u32)); + d01 = vqmovun_s16(vreinterpretq_s16_u16(d01_u16)); + d32 = vqmovun_s16(vreinterpretq_s16_u16(d32_u16)); - d22s16 = vdup_n_s16(cospi_24_64); - - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_high_s16(q9s16); // vswp d18 d19 - d19s16 = vget_low_s16(q9s16); - - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = 
vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - - // do the transform on columns - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - q8s16 = vrshrq_n_s16(q8s16, 4); - q9s16 = vrshrq_n_s16(q9s16, 4); - - d = dest; - d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); - d += dest_stride; - d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); - - d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - - d = dest; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); - return; + store_u8(dest, stride, d01); + dest += 2 * stride; + // The elements are stored in reverse order. + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 1); + dest += stride; + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 0); } diff --git a/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm deleted file mode 100644 index dbbff364f3..0000000000 --- a/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm +++ /dev/null @@ -1,88 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. 
-; - - - EXPORT |vpx_idct8x8_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|vpx_idct8x8_1_add_neon| PROC - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 5) - add r0, r0, #16 ; + (1 <<((5) - 1)) - asr r0, r0, #5 ; >> 5 - - vdup.s16 q0, r0 ; duplicate a1 - - ; load destination data - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r2 - vld1.64 {d17}, [r1] - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r2 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r2 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r2 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r2 - vst1.64 {d31}, [r12], r2 - - bx lr - ENDP ; |vpx_idct8x8_1_add_neon| - - END diff --git a/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c index e3db0b876b..ce9b459589 100644 --- a/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c @@ -10,52 +10,56 @@ #include +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 5); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d5u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - 
d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8)); - d2 += dest_stride; - } - return; +static INLINE uint8x8_t create_dcd(const int16_t dc) { + int16x8_t t = vdupq_n_s16(dc); + return vqmovun_s16(t); +} + +static INLINE void idct8x8_1_add_pos_kernel(uint8_t **dest, const int stride, + const uint8x8_t res) { + const uint8x8_t a = vld1_u8(*dest); + const uint8x8_t b = vqadd_u8(a, res); + vst1_u8(*dest, b); + *dest += stride; +} + +static INLINE void idct8x8_1_add_neg_kernel(uint8_t **dest, const int stride, + const uint8x8_t res) { + const uint8x8_t a = vld1_u8(*dest); + const uint8x8_t b = vqsub_u8(a, res); + vst1_u8(*dest, b); + *dest += stride; +} + +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); + + if (a1 >= 0) { + const uint8x8_t dc = create_dcd(a1); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + } else { + const uint8x8_t dc = create_dcd(-a1); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + } } diff --git a/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm b/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm deleted file mode 100644 index 6ab59b41b7..0000000000 --- a/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm +++ /dev/null @@ -1,519 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vpx_idct8x8_64_add_neon| - EXPORT |vpx_idct8x8_12_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are - ; loaded in q8-q15. The output will be stored back into q8-q15 registers. - ; This macro will touch q0-q7 registers and use them as buffer during - ; calculation. 
- MACRO - IDCT8x8_1D - ; stage 1 - vdup.16 d0, r3 ; duplicate cospi_28_64 - vdup.16 d1, r4 ; duplicate cospi_4_64 - vdup.16 d2, r5 ; duplicate cospi_12_64 - vdup.16 d3, r6 ; duplicate cospi_20_64 - - ; input[1] * cospi_28_64 - vmull.s16 q2, d18, d0 - vmull.s16 q3, d19, d0 - - ; input[5] * cospi_12_64 - vmull.s16 q5, d26, d2 - vmull.s16 q6, d27, d2 - - ; input[1]*cospi_28_64-input[7]*cospi_4_64 - vmlsl.s16 q2, d30, d1 - vmlsl.s16 q3, d31, d1 - - ; input[5] * cospi_12_64 - input[3] * cospi_20_64 - vmlsl.s16 q5, d22, d3 - vmlsl.s16 q6, d23, d3 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d8, q2, #14 ; >> 14 - vqrshrn.s32 d9, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q5, #14 ; >> 14 - vqrshrn.s32 d11, q6, #14 ; >> 14 - - ; input[1] * cospi_4_64 - vmull.s16 q2, d18, d1 - vmull.s16 q3, d19, d1 - - ; input[5] * cospi_20_64 - vmull.s16 q9, d26, d3 - vmull.s16 q13, d27, d3 - - ; input[1]*cospi_4_64+input[7]*cospi_28_64 - vmlal.s16 q2, d30, d0 - vmlal.s16 q3, d31, d0 - - ; input[5] * cospi_20_64 + input[3] * cospi_12_64 - vmlal.s16 q9, d22, d2 - vmlal.s16 q13, d23, d2 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d14, q2, #14 ; >> 14 - vqrshrn.s32 d15, q3, #14 ; >> 14 - - ; stage 2 & stage 3 - even half - vdup.16 d0, r7 ; duplicate cospi_16_64 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q13, #14 ; >> 14 - - ; input[0] * cospi_16_64 - vmull.s16 q2, d16, d0 - vmull.s16 q3, d17, d0 - - ; input[0] * cospi_16_64 - vmull.s16 q13, d16, d0 - vmull.s16 q15, d17, d0 - - ; (input[0] + input[2]) * cospi_16_64 - vmlal.s16 q2, d24, d0 - vmlal.s16 q3, d25, d0 - - ; (input[0] - input[2]) * cospi_16_64 - vmlsl.s16 q13, d24, d0 - vmlsl.s16 q15, d25, d0 - - vdup.16 d0, r8 ; duplicate cospi_24_64 - vdup.16 d1, r9 ; duplicate cospi_8_64 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d18, q2, #14 ; >> 14 - vqrshrn.s32 d19, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d22, q13, #14 ; >> 14 - vqrshrn.s32 d23, q15, #14 ; >> 14 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - ; input[1] * cospi_24_64 - vmull.s16 q2, d20, d0 - vmull.s16 q3, d21, d0 - - ; input[1] * cospi_8_64 - vmull.s16 q8, d20, d1 - vmull.s16 q12, d21, d1 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlsl.s16 q2, d28, d1 - vmlsl.s16 q3, d29, d1 - - ; input[1] * cospi_8_64 + input[3] * cospi_24_64 - vmlal.s16 q8, d28, d0 - vmlal.s16 q12, d29, d0 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d26, q2, #14 ; >> 14 - vqrshrn.s32 d27, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d30, q8, #14 ; >> 14 - vqrshrn.s32 d31, q12, #14 ; >> 14 - - vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] - vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] - vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2] - vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] - - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - - ; stage 2 - odd half - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] - vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q11, d28, d16 - vmull.s16 q12, d29, d16 - - ; (step2[6] - step2[5]) * cospi_16_64 - vmlsl.s16 q9, d26, d16 - 
vmlsl.s16 q10, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vmlal.s16 q11, d26, d16 - vmlal.s16 q12, d27, d16 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q9, #14 ; >> 14 - vqrshrn.s32 d11, q10, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q11, #14 ; >> 14 - vqrshrn.s32 d13, q12, #14 ; >> 14 - - ; stage 4 - vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; - MEND - - ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. - MACRO - TRANSPOSE8X8 - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - MEND - - AREA Block, CODE, READONLY ; name this block of code -;void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|vpx_idct8x8_64_add_neon| PROC - push {r4-r9} - vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! - - ; transpose the input data - TRANSPOSE8X8 - - ; generate cospi_28_64 = 3196 - mov r3, #0x0c00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r4, #0x3e00 - add r4, #0xc5 - - ; generate cospi_12_64 = 13623 - mov r5, #0x3500 - add r5, #0x37 - - ; generate cospi_20_64 = 9102 - mov r6, #0x2300 - add r6, #0x8e - - ; generate cospi_16_64 = 11585 - mov r7, #0x2d00 - add r7, #0x41 - - ; generate cospi_24_64 = 6270 - mov r8, #0x1800 - add r8, #0x7e - - ; generate cospi_8_64 = 15137 - mov r9, #0x3b00 - add r9, #0x21 - - ; First transform rows - IDCT8x8_1D - - ; Transpose the matrix - TRANSPOSE8X8 - - ; Then transform columns - IDCT8x8_1D - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) - vrshr.s16 q8, q8, #5 - vrshr.s16 q9, q9, #5 - vrshr.s16 q10, q10, #5 - vrshr.s16 q11, q11, #5 - vrshr.s16 q12, q12, #5 - vrshr.s16 q13, q13, #5 - vrshr.s16 q14, q14, #5 - vrshr.s16 q15, q15, #5 - - ; save dest pointer - mov r0, r1 - - ; load destination data - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d0 - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vaddw.u8 q14, q14, d6 - vaddw.u8 q15, q15, d7 - - ; clip_pixel - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vqmovun.s16 d4, q12 - vqmovun.s16 d5, q13 - vqmovun.s16 d6, q14 - vqmovun.s16 d7, q15 - - ; store the data - vst1.64 {d0}, [r0], r2 - vst1.64 {d1}, [r0], r2 - vst1.64 {d2}, [r0], r2 - vst1.64 {d3}, [r0], r2 - vst1.64 {d4}, [r0], r2 - vst1.64 {d5}, [r0], r2 - vst1.64 {d6}, [r0], r2 - vst1.64 {d7}, [r0], r2 - - vpop {d8-d15} - pop {r4-r9} - bx lr - ENDP ; |vpx_idct8x8_64_add_neon| - -;void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; 
r2 int dest_stride) - -|vpx_idct8x8_12_add_neon| PROC - push {r4-r9} - vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! - - ; transpose the input data - TRANSPOSE8X8 - - ; generate cospi_28_64 = 3196 - mov r3, #0x0c00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r4, #0x3e00 - add r4, #0xc5 - - ; generate cospi_12_64 = 13623 - mov r5, #0x3500 - add r5, #0x37 - - ; generate cospi_20_64 = 9102 - mov r6, #0x2300 - add r6, #0x8e - - ; generate cospi_16_64 = 11585 - mov r7, #0x2d00 - add r7, #0x41 - - ; generate cospi_24_64 = 6270 - mov r8, #0x1800 - add r8, #0x7e - - ; generate cospi_8_64 = 15137 - mov r9, #0x3b00 - add r9, #0x21 - - ; First transform rows - ; stage 1 - ; The following instructions use vqrdmulh to do the - ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling - ; multiply and shift the result by 16 bits instead of 14 bits. So we need - ; to double the constants before multiplying to compensate this. - mov r12, r3, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_28_64*2 - mov r12, r4, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_4_64*2 - - ; dct_const_round_shift(input[1] * cospi_28_64) - vqrdmulh.s16 q4, q9, q0 - - mov r12, r6, lsl #1 - rsb r12, #0 - vdup.16 q0, r12 ; duplicate -cospi_20_64*2 - - ; dct_const_round_shift(input[1] * cospi_4_64) - vqrdmulh.s16 q7, q9, q1 - - mov r12, r5, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_12_64*2 - - ; dct_const_round_shift(- input[3] * cospi_20_64) - vqrdmulh.s16 q5, q11, q0 - - mov r12, r7, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_16_64*2 - - ; dct_const_round_shift(input[3] * cospi_12_64) - vqrdmulh.s16 q6, q11, q1 - - ; stage 2 & stage 3 - even half - mov r12, r8, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_24_64*2 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrdmulh.s16 q9, q8, q0 - - mov r12, r9, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_8_64*2 - - ; dct_const_round_shift(input[1] * cospi_24_64) - vqrdmulh.s16 q13, q10, q1 - - ; dct_const_round_shift(input[1] * cospi_8_64) - vqrdmulh.s16 q15, q10, q0 - - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - - vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] - vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2] - vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2] - vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] - - ; stage 2 - odd half - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] - vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q11, d28, d16 - vmull.s16 q12, d29, d16 - - ; (step2[6] - step2[5]) * cospi_16_64 - vmlsl.s16 q9, d26, d16 - vmlsl.s16 q10, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vmlal.s16 q11, d26, d16 - vmlal.s16 q12, d27, d16 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q9, #14 ; >> 14 - vqrshrn.s32 d11, q10, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q11, #14 ; >> 14 - vqrshrn.s32 d13, q12, #14 ; >> 14 - - ; stage 4 - vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, 
q2, q5 ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; - - ; Transpose the matrix - TRANSPOSE8X8 - - ; Then transform columns - IDCT8x8_1D - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) - vrshr.s16 q8, q8, #5 - vrshr.s16 q9, q9, #5 - vrshr.s16 q10, q10, #5 - vrshr.s16 q11, q11, #5 - vrshr.s16 q12, q12, #5 - vrshr.s16 q13, q13, #5 - vrshr.s16 q14, q14, #5 - vrshr.s16 q15, q15, #5 - - ; save dest pointer - mov r0, r1 - - ; load destination data - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d0 - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vaddw.u8 q14, q14, d6 - vaddw.u8 q15, q15, d7 - - ; clip_pixel - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vqmovun.s16 d4, q12 - vqmovun.s16 d5, q13 - vqmovun.s16 d6, q14 - vqmovun.s16 d7, q15 - - ; store the data - vst1.64 {d0}, [r0], r2 - vst1.64 {d1}, [r0], r2 - vst1.64 {d2}, [r0], r2 - vst1.64 {d3}, [r0], r2 - vst1.64 {d4}, [r0], r2 - vst1.64 {d5}, [r0], r2 - vst1.64 {d6}, [r0], r2 - vst1.64 {d7}, [r0], r2 - - vpop {d8-d15} - pop {r4-r9} - bx lr - ENDP ; |vpx_idct8x8_12_add_neon| - - END diff --git a/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c index 82b3182568..1121ade279 100644 --- a/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c @@ -11,435 +11,117 @@ #include #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" -static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; +static INLINE void add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, + int16x8_t a3, int16x8_t a4, int16x8_t a5, + int16x8_t a6, int16x8_t a7, uint8_t *dest, + const int stride) { + const uint8_t *dst = dest; + uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; + uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); + a0 = vrshrq_n_s16(a0, 5); + a1 = vrshrq_n_s16(a1, 5); + a2 = vrshrq_n_s16(a2, 5); + a3 = vrshrq_n_s16(a3, 5); + a4 = vrshrq_n_s16(a4, 5); + a5 = vrshrq_n_s16(a5, 5); + a6 = vrshrq_n_s16(a6, 5); + a7 = vrshrq_n_s16(a7, 5); - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - 
d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); + d0 = vld1_u8(dst); + dst += stride; + d1 = vld1_u8(dst); + dst += stride; + d2 = vld1_u8(dst); + dst += stride; + d3 = vld1_u8(dst); + dst += stride; + d4 = vld1_u8(dst); + dst += stride; + d5 = vld1_u8(dst); + dst += stride; + d6 = vld1_u8(dst); + dst += stride; + d7 = vld1_u8(dst); - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d26s16, d2s16); - q6s32 = vmull_s16(d27s16, d2s16); + d0_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), d0); + d1_u16 = vaddw_u8(vreinterpretq_u16_s16(a1), d1); + d2_u16 = vaddw_u8(vreinterpretq_u16_s16(a2), d2); + d3_u16 = vaddw_u8(vreinterpretq_u16_s16(a3), d3); + d4_u16 = vaddw_u8(vreinterpretq_u16_s16(a4), d4); + d5_u16 = vaddw_u8(vreinterpretq_u16_s16(a5), d5); + d6_u16 = vaddw_u8(vreinterpretq_u16_s16(a6), d6); + d7_u16 = vaddw_u8(vreinterpretq_u16_s16(a7), d7); - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); - q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); + d0 = vqmovun_s16(vreinterpretq_s16_u16(d0_u16)); + d1 = vqmovun_s16(vreinterpretq_s16_u16(d1_u16)); + d2 = vqmovun_s16(vreinterpretq_s16_u16(d2_u16)); + d3 = vqmovun_s16(vreinterpretq_s16_u16(d3_u16)); + d4 = vqmovun_s16(vreinterpretq_s16_u16(d4_u16)); + d5 = vqmovun_s16(vreinterpretq_s16_u16(d5_u16)); + d6 = vqmovun_s16(vreinterpretq_s16_u16(d6_u16)); + d7 = vqmovun_s16(vreinterpretq_s16_u16(d7_u16)); - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q2s32 = vmull_s16(d18s16, d1s16); - q3s32 = vmull_s16(d19s16, d1s16); - q9s32 = vmull_s16(d26s16, d3s16); - q13s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlal_s16(q2s32, d30s16, d0s16); - q3s32 = vmlal_s16(q3s32, d31s16, d0s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - - d14s16 = vqrshrn_n_s32(q2s32, 14); - d15s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q13s32, 14); - q6s16 = vcombine_s16(d12s16, d13s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d0s16 = vdup_n_s16(cospi_16_64); - - q2s32 = vmull_s16(d16s16, d0s16); - q3s32 = vmull_s16(d17s16, d0s16); - q13s32 = vmull_s16(d16s16, d0s16); - q15s32 = vmull_s16(d17s16, d0s16); - - q2s32 = vmlal_s16(q2s32, d24s16, d0s16); - q3s32 = vmlal_s16(q3s32, d25s16, d0s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); - q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); - - d0s16 = vdup_n_s16(cospi_24_64); - d1s16 = vdup_n_s16(cospi_8_64); - - d18s16 = vqrshrn_n_s32(q2s32, 14); - d19s16 = vqrshrn_n_s32(q3s32, 14); - d22s16 = vqrshrn_n_s32(q13s32, 14); - d23s16 = vqrshrn_n_s32(q15s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q2s32 = vmull_s16(d20s16, d0s16); - q3s32 = vmull_s16(d21s16, d0s16); - q8s32 = vmull_s16(d20s16, d1s16); - q12s32 = vmull_s16(d21s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); - q8s32 = vmlal_s16(q8s32, d28s16, d0s16); - q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - - d26s16 = vqrshrn_n_s32(q2s32, 14); - d27s16 = vqrshrn_n_s32(q3s32, 14); - 
d30s16 = vqrshrn_n_s32(q8s32, 14); - d31s16 = vqrshrn_n_s32(q12s32, 14); - *q13s16 = vcombine_s16(d26s16, d27s16); - *q15s16 = vcombine_s16(d30s16, d31s16); - - q0s16 = vaddq_s16(*q9s16, *q15s16); - q1s16 = vaddq_s16(*q11s16, *q13s16); - q2s16 = vsubq_s16(*q11s16, *q13s16); - q3s16 = vsubq_s16(*q9s16, *q15s16); - - *q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - *q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - - d16s16 = vdup_n_s16(cospi_16_64); - - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - *q8s16 = vaddq_s16(q0s16, q7s16); - *q9s16 = vaddq_s16(q1s16, q6s16); - *q10s16 = vaddq_s16(q2s16, q5s16); - *q11s16 = vaddq_s16(q3s16, q4s16); - *q12s16 = vsubq_s16(q3s16, q4s16); - *q13s16 = vsubq_s16(q2s16, q5s16); - *q14s16 = vsubq_s16(q1s16, q6s16); - *q15s16 = vsubq_s16(q0s16, q7s16); - return; + vst1_u8(dest, d0); + dest += stride; + vst1_u8(dest, d1); + dest += stride; + vst1_u8(dest, d2); + dest += stride; + vst1_u8(dest, d3); + dest += stride; + vst1_u8(dest, d4); + dest += stride; + vst1_u8(dest, d5); + dest += stride; + vst1_u8(dest, d6); + dest += stride; + vst1_u8(dest, d7); } -void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + int16x8_t a0 = load_tran_low_to_s16q(input); + int16x8_t a1 = load_tran_low_to_s16q(input + 8); + int16x8_t a2 = load_tran_low_to_s16q(input + 16); + int16x8_t a3 = load_tran_low_to_s16q(input + 24); + int16x8_t a4 = load_tran_low_to_s16q(input + 32); + int16x8_t a5 = load_tran_low_to_s16q(input + 40); + int16x8_t a6 = load_tran_low_to_s16q(input + 48); + int16x8_t a7 = load_tran_low_to_s16q(input + 56); - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = 
vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - return; + idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + add8x8(a0, a1, a2, a3, a4, a5, a6, a7, dest, stride); } -void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; - int16x4_t d26s16, d27s16, d28s16, d29s16; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - int32x4_t q9s32, q10s32, q11s32, q12s32; +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x8_t cospisd = vaddq_s16(cospis, cospis); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 + const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 + int16x4_t a0, 
a1, a2, a3, a4, a5, a6, a7; + int16x8_t b0, b1, b2, b3, b4, b5, b6, b7; - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); + a0 = load_tran_low_to_s16d(input); + a1 = load_tran_low_to_s16d(input + 8); + a2 = load_tran_low_to_s16d(input + 16); + a3 = load_tran_low_to_s16d(input + 24); - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // First transform rows - // stage 1 - q0s16 = vdupq_n_s16(cospi_28_64 * 2); - q1s16 = vdupq_n_s16(cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - - q0s16 = vdupq_n_s16(-cospi_20_64 * 2); - - q7s16 = vqrdmulhq_s16(q9s16, q1s16); - - q1s16 = vdupq_n_s16(cospi_12_64 * 2); - - q5s16 = vqrdmulhq_s16(q11s16, q0s16); - - q0s16 = vdupq_n_s16(cospi_16_64 * 2); - - q6s16 = vqrdmulhq_s16(q11s16, q1s16); - - // stage 2 & stage 3 - even half - q1s16 = vdupq_n_s16(cospi_24_64 * 2); - - q9s16 = vqrdmulhq_s16(q8s16, q0s16); - - q0s16 = vdupq_n_s16(cospi_8_64 * 2); - - q13s16 = vqrdmulhq_s16(q10s16, q1s16); - - q15s16 = vqrdmulhq_s16(q10s16, q0s16); - - // stage 3 -odd half - q0s16 = vaddq_s16(q9s16, q15s16); - q1s16 = vaddq_s16(q9s16, q13s16); - q2s16 = vsubq_s16(q9s16, q13s16); - q3s16 = vsubq_s16(q9s16, q15s16); - - // stage 2 - odd half - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d16s16 = vdup_n_s16(cospi_16_64); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 4 - q8s16 = vaddq_s16(q0s16, q7s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q7s16); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), 
vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - return; + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &a0, &a1, &a2, &a3, &a4, + &a5, &a6, &a7); + idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a0, a1, a2, a3, a4, a5, a6, + a7, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); + add8x8(b0, b1, b2, b3, b4, b5, b6, b7, dest, stride); } diff --git a/libs/libvpx/vpx_dsp/arm/idct_neon.asm b/libs/libvpx/vpx_dsp/arm/idct_neon.asm new file mode 100644 index 0000000000..5dd9bdc788 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/idct_neon.asm @@ -0,0 +1,46 @@ +; +; Copyright (c) 2016 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + INCLUDE ./vpx_config.asm + + ; Helper functions used to load tran_low_t into int16, narrowing if + ; necessary. + + ; $dst0..3 are d registers with the pairs assumed to be contiguous in + ; non-high-bitdepth builds. q0-q3 are used as temporaries in high-bitdepth. + MACRO + LOAD_TRAN_LOW_TO_S16 $dst0, $dst1, $dst2, $dst3, $src + IF CONFIG_VP9_HIGHBITDEPTH + vld1.s32 {q0,q1}, [$src]! + vld1.s32 {q2,q3}, [$src]! + vmovn.i32 $dst0, q0 + vmovn.i32 $dst1, q1 + vmovn.i32 $dst2, q2 + vmovn.i32 $dst3, q3 + ELSE + vld1.s16 {$dst0-$dst1,$dst2-$dst3}, [$src]! + ENDIF + MEND + + ; $dst0..3 are d registers. q0-q3 are used as temporaries in high-bitdepth. + MACRO + LOAD_TRAN_LOW_TO_S16X2 $dst0, $dst1, $dst2, $dst3, $src + IF CONFIG_VP9_HIGHBITDEPTH + vld2.s32 {q0,q1}, [$src]! + vld2.s32 {q2,q3}, [$src]! 
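+    ; In high-bitdepth builds tran_low_t is 32 bits wide, so vmovn narrows
+    ; each lane back to int16 before the idct kernels consume it.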
+ vmovn.i32 $dst0, q0 + vmovn.i32 $dst1, q2 + vmovn.i32 $dst2, q1 + vmovn.i32 $dst3, q3 + ELSE + vld2.s16 {$dst0,$dst1,$dst2,$dst3}, [$src]! + ENDIF + MEND + END diff --git a/libs/libvpx/vpx_dsp/arm/idct_neon.h b/libs/libvpx/vpx_dsp/arm/idct_neon.h new file mode 100644 index 0000000000..6ed02af5ac --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/idct_neon.h @@ -0,0 +1,940 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_ARM_IDCT_NEON_H_ +#define VPX_DSP_ARM_IDCT_NEON_H_ + +#include + +#include "./vpx_config.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/vpx_dsp_common.h" + +static const int16_t kCospi[16] = { + 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, + 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, + 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, + -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */, + 16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */, + 14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */, + 15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */, + 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */ +}; + +static const int32_t kCospi32[16] = { + 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, + 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, + 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, + -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */, + 16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */, + 14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */, + 15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */, + 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */ +}; + +//------------------------------------------------------------------------------ +// Use saturating add/sub to avoid overflow in 2nd pass in high bit-depth +static INLINE int16x8_t final_add(const int16x8_t a, const int16x8_t b) { +#if CONFIG_VP9_HIGHBITDEPTH + return vqaddq_s16(a, b); +#else + return vaddq_s16(a, b); +#endif +} + +static INLINE int16x8_t final_sub(const int16x8_t a, const int16x8_t b) { +#if CONFIG_VP9_HIGHBITDEPTH + return vqsubq_s16(a, b); +#else + return vsubq_s16(a, b); +#endif +} + +//------------------------------------------------------------------------------ + +static INLINE int32x4x2_t highbd_idct_add_dual(const int32x4x2_t s0, + const int32x4x2_t s1) { + int32x4x2_t t; + t.val[0] = vaddq_s32(s0.val[0], s1.val[0]); + t.val[1] = vaddq_s32(s0.val[1], s1.val[1]); + return t; +} + +static INLINE int32x4x2_t highbd_idct_sub_dual(const int32x4x2_t s0, + const int32x4x2_t s1) { + int32x4x2_t t; + t.val[0] = vsubq_s32(s0.val[0], s1.val[0]); + t.val[1] = vsubq_s32(s0.val[1], s1.val[1]); + return t; +} + +//------------------------------------------------------------------------------ + +// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS. +static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a, + const int16_t a_const) { + // Shift by DCT_CONST_BITS + rounding will be within 16 bits for well formed + // streams. See WRAPLOW and dct_const_round_shift for details. + // This instruction doubles the result and returns the high half, essentially + // resulting in a right shift by 15. 
By multiplying the constant first that + // becomes a right shift by DCT_CONST_BITS. + // The largest possible value used here is + // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728) a which falls *just* + // within the range of int16_t (+32767 / -32768) even when negated. + return vqrdmulhq_n_s16(a, a_const * 2); +} + +// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS. +static INLINE int16x8_t add_multiply_shift_and_narrow_s16( + const int16x8_t a, const int16x8_t b, const int16_t ab_const) { + // In both add_ and it's pair, sub_, the input for well-formed streams will be + // well within 16 bits (input to the idct is the difference between two frames + // and will be within -255 to 255, or 9 bits) + // However, for inputs over about 25,000 (valid for int16_t, but not for idct + // input) this function can not use vaddq_s16. + // In order to match existing behavior and intentionally out of range tests, + // expand the addition up to 32 bits to prevent truncation. + int32x4_t temp_low = vaddl_s16(vget_low_s16(a), vget_low_s16(b)); + int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b)); + temp_low = vmulq_n_s32(temp_low, ab_const); + temp_high = vmulq_n_s32(temp_high, ab_const); + return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS), + vrshrn_n_s32(temp_high, DCT_CONST_BITS)); +} + +// Subtract b from a, then multiply by ab_const. Shift and narrow by +// DCT_CONST_BITS. +static INLINE int16x8_t sub_multiply_shift_and_narrow_s16( + const int16x8_t a, const int16x8_t b, const int16_t ab_const) { + int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b)); + int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b)); + temp_low = vmulq_n_s32(temp_low, ab_const); + temp_high = vmulq_n_s32(temp_high, ab_const); + return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS), + vrshrn_n_s32(temp_high, DCT_CONST_BITS)); +} + +// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by +// DCT_CONST_BITS. +static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16( + const int16x8_t a, const int16_t a_const, const int16x8_t b, + const int16_t b_const) { + int32x4_t temp_low = vmull_n_s16(vget_low_s16(a), a_const); + int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const); + temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const); + temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const); + return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS), + vrshrn_n_s32(temp_high, DCT_CONST_BITS)); +} + +//------------------------------------------------------------------------------ + +// Note: The following 4 functions could use 32-bit operations for bit-depth 10. +// However, although it's 20% faster with gcc, it's 20% slower with clang. +// Use 64-bit operations for now. + +// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS. +static INLINE int32x4x2_t +multiply_shift_and_narrow_s32_dual(const int32x4x2_t a, const int32_t a_const) { + int64x2_t b[4]; + int32x4x2_t c; + b[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const); + b[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const); + b[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const); + b[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const); + c.val[0] = vcombine_s32(vrshrn_n_s64(b[0], DCT_CONST_BITS), + vrshrn_n_s64(b[1], DCT_CONST_BITS)); + c.val[1] = vcombine_s32(vrshrn_n_s64(b[2], DCT_CONST_BITS), + vrshrn_n_s64(b[3], DCT_CONST_BITS)); + return c; +} + +// Add a and b, then multiply by ab_const. 
Shift and narrow by DCT_CONST_BITS. +static INLINE int32x4x2_t add_multiply_shift_and_narrow_s32_dual( + const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) { + const int32x4_t temp_low = vaddq_s32(a.val[0], b.val[0]); + const int32x4_t temp_high = vaddq_s32(a.val[1], b.val[1]); + int64x2_t c[4]; + int32x4x2_t d; + c[0] = vmull_n_s32(vget_low_s32(temp_low), ab_const); + c[1] = vmull_n_s32(vget_high_s32(temp_low), ab_const); + c[2] = vmull_n_s32(vget_low_s32(temp_high), ab_const); + c[3] = vmull_n_s32(vget_high_s32(temp_high), ab_const); + d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), + vrshrn_n_s64(c[1], DCT_CONST_BITS)); + d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), + vrshrn_n_s64(c[3], DCT_CONST_BITS)); + return d; +} + +// Subtract b from a, then multiply by ab_const. Shift and narrow by +// DCT_CONST_BITS. +static INLINE int32x4x2_t sub_multiply_shift_and_narrow_s32_dual( + const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) { + const int32x4_t temp_low = vsubq_s32(a.val[0], b.val[0]); + const int32x4_t temp_high = vsubq_s32(a.val[1], b.val[1]); + int64x2_t c[4]; + int32x4x2_t d; + c[0] = vmull_n_s32(vget_low_s32(temp_low), ab_const); + c[1] = vmull_n_s32(vget_high_s32(temp_low), ab_const); + c[2] = vmull_n_s32(vget_low_s32(temp_high), ab_const); + c[3] = vmull_n_s32(vget_high_s32(temp_high), ab_const); + d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), + vrshrn_n_s64(c[1], DCT_CONST_BITS)); + d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), + vrshrn_n_s64(c[3], DCT_CONST_BITS)); + return d; +} + +// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by +// DCT_CONST_BITS. +static INLINE int32x4x2_t multiply_accumulate_shift_and_narrow_s32_dual( + const int32x4x2_t a, const int32_t a_const, const int32x4x2_t b, + const int32_t b_const) { + int64x2_t c[4]; + int32x4x2_t d; + c[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const); + c[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const); + c[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const); + c[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const); + c[0] = vmlal_n_s32(c[0], vget_low_s32(b.val[0]), b_const); + c[1] = vmlal_n_s32(c[1], vget_high_s32(b.val[0]), b_const); + c[2] = vmlal_n_s32(c[2], vget_low_s32(b.val[1]), b_const); + c[3] = vmlal_n_s32(c[3], vget_high_s32(b.val[1]), b_const); + d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), + vrshrn_n_s64(c[1], DCT_CONST_BITS)); + d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), + vrshrn_n_s64(c[3], DCT_CONST_BITS)); + return d; +} + +// Shift the output down by 6 and add it to the destination buffer. 
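+// vrsraq_n_s16(b, a, 6) computes b + ROUND_POWER_OF_TWO(a, 6) in a single
+// instruction, fusing the rounding shift with the reconstruction add. For
+// example, a residual lane of 70 over a destination pixel of 200 gives
+// 200 + ((70 + 32) >> 6) = 201, which vqmovun_s16 then clips to [0, 255].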
+static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1, + const int16x8_t a2, const int16x8_t a3, + const int16x8_t a4, const int16x8_t a5, + const int16x8_t a6, const int16x8_t a7, + uint8_t *b, const int b_stride) { + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7; + int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + b0 = vld1_u8(b); + b += b_stride; + b1 = vld1_u8(b); + b += b_stride; + b2 = vld1_u8(b); + b += b_stride; + b3 = vld1_u8(b); + b += b_stride; + b4 = vld1_u8(b); + b += b_stride; + b5 = vld1_u8(b); + b += b_stride; + b6 = vld1_u8(b); + b += b_stride; + b7 = vld1_u8(b); + b -= (7 * b_stride); + + // c = b + (a >> 6) + c0 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b0)), a0, 6); + c1 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b1)), a1, 6); + c2 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b2)), a2, 6); + c3 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b3)), a3, 6); + c4 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b4)), a4, 6); + c5 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b5)), a5, 6); + c6 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b6)), a6, 6); + c7 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b7)), a7, 6); + + b0 = vqmovun_s16(c0); + b1 = vqmovun_s16(c1); + b2 = vqmovun_s16(c2); + b3 = vqmovun_s16(c3); + b4 = vqmovun_s16(c4); + b5 = vqmovun_s16(c5); + b6 = vqmovun_s16(c6); + b7 = vqmovun_s16(c7); + + vst1_u8(b, b0); + b += b_stride; + vst1_u8(b, b1); + b += b_stride; + vst1_u8(b, b2); + b += b_stride; + vst1_u8(b, b3); + b += b_stride; + vst1_u8(b, b4); + b += b_stride; + vst1_u8(b, b5); + b += b_stride; + vst1_u8(b, b6); + b += b_stride; + vst1_u8(b, b7); +} + +static INLINE uint8x16_t create_dcq(const int16_t dc) { + // Clip both sides and gcc may compile to assembly 'usat'. + const int16_t t = (dc < 0) ? 0 : ((dc > 255) ? 
255 : dc); + return vdupq_n_u8((uint8_t)t); +} + +static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis, + int16x8_t *const a0, + int16x8_t *const a1) { + int16x4_t b0, b1, b2, b3; + int32x4_t c0, c1, c2, c3; + int16x8_t d0, d1; + + transpose_s16_4x4q(a0, a1); + b0 = vget_low_s16(*a0); + b1 = vget_high_s16(*a0); + b2 = vget_low_s16(*a1); + b3 = vget_high_s16(*a1); + c0 = vmull_lane_s16(b0, cospis, 2); + c2 = vmull_lane_s16(b1, cospis, 2); + c1 = vsubq_s32(c0, c2); + c0 = vaddq_s32(c0, c2); + c2 = vmull_lane_s16(b2, cospis, 3); + c3 = vmull_lane_s16(b2, cospis, 1); + c2 = vmlsl_lane_s16(c2, b3, cospis, 1); + c3 = vmlal_lane_s16(c3, b3, cospis, 3); + b0 = vrshrn_n_s32(c0, DCT_CONST_BITS); + b1 = vrshrn_n_s32(c1, DCT_CONST_BITS); + b2 = vrshrn_n_s32(c2, DCT_CONST_BITS); + b3 = vrshrn_n_s32(c3, DCT_CONST_BITS); + d0 = vcombine_s16(b0, b1); + d1 = vcombine_s16(b3, b2); + *a0 = vaddq_s16(d0, d1); + *a1 = vsubq_s16(d0, d1); +} + +static INLINE void idct8x8_12_pass1_bd8( + const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1, + int16x4_t *const io0, int16x4_t *const io1, int16x4_t *const io2, + int16x4_t *const io3, int16x4_t *const io4, int16x4_t *const io5, + int16x4_t *const io6, int16x4_t *const io7) { + int16x4_t step1[8], step2[8]; + int32x4_t t32[2]; + + transpose_s16_4x4d(io0, io1, io2, io3); + + // stage 1 + step1[4] = vqrdmulh_lane_s16(*io1, cospisd1, 3); + step1[5] = vqrdmulh_lane_s16(*io3, cospisd1, 2); + step1[6] = vqrdmulh_lane_s16(*io3, cospisd1, 1); + step1[7] = vqrdmulh_lane_s16(*io1, cospisd1, 0); + + // stage 2 + step2[1] = vqrdmulh_lane_s16(*io0, cospisd0, 2); + step2[2] = vqrdmulh_lane_s16(*io2, cospisd0, 3); + step2[3] = vqrdmulh_lane_s16(*io2, cospisd0, 1); + + step2[4] = vadd_s16(step1[4], step1[5]); + step2[5] = vsub_s16(step1[4], step1[5]); + step2[6] = vsub_s16(step1[7], step1[6]); + step2[7] = vadd_s16(step1[7], step1[6]); + + // stage 3 + step1[0] = vadd_s16(step2[1], step2[3]); + step1[1] = vadd_s16(step2[1], step2[2]); + step1[2] = vsub_s16(step2[1], step2[2]); + step1[3] = vsub_s16(step2[1], step2[3]); + + t32[1] = vmull_lane_s16(step2[6], cospis0, 2); + t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2); + t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2); + step1[5] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); + + // stage 4 + *io0 = vadd_s16(step1[0], step2[7]); + *io1 = vadd_s16(step1[1], step1[6]); + *io2 = vadd_s16(step1[2], step1[5]); + *io3 = vadd_s16(step1[3], step2[4]); + *io4 = vsub_s16(step1[3], step2[4]); + *io5 = vsub_s16(step1[2], step1[5]); + *io6 = vsub_s16(step1[1], step1[6]); + *io7 = vsub_s16(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_pass2_bd8( + const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1, + const int16x4_t input0, const int16x4_t input1, const int16x4_t input2, + const int16x4_t input3, const int16x4_t input4, const int16x4_t input5, + const int16x4_t input6, const int16x4_t input7, int16x8_t *const output0, + int16x8_t *const output1, int16x8_t *const output2, + int16x8_t *const output3, int16x8_t *const output4, + int16x8_t *const output5, int16x8_t *const output6, + int16x8_t *const output7) { + int16x8_t in[4]; + int16x8_t step1[8], step2[8]; + int32x4_t t32[8]; + int16x4_t t16[8]; + + transpose_s16_4x8(input0, input1, input2, input3, input4, input5, input6, + input7, &in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3); + step1[5] = vqrdmulhq_lane_s16(in[3], 
cospisd1, 2); + step1[6] = vqrdmulhq_lane_s16(in[3], cospisd1, 1); + step1[7] = vqrdmulhq_lane_s16(in[1], cospisd1, 0); + + // stage 2 + step2[1] = vqrdmulhq_lane_s16(in[0], cospisd0, 2); + step2[2] = vqrdmulhq_lane_s16(in[2], cospisd0, 3); + step2[3] = vqrdmulhq_lane_s16(in[2], cospisd0, 1); + + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s16(step2[1], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[1], step2[3]); + + t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2); + t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2); + t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); + t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); + t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); + step1[5] = vcombine_s16(t16[0], t16[1]); + step1[6] = vcombine_s16(t16[2], t16[3]); + + // stage 4 + *output0 = vaddq_s16(step1[0], step2[7]); + *output1 = vaddq_s16(step1[1], step1[6]); + *output2 = vaddq_s16(step1[2], step1[5]); + *output3 = vaddq_s16(step1[3], step2[4]); + *output4 = vsubq_s16(step1[3], step2[4]); + *output5 = vsubq_s16(step1[2], step1[5]); + *output6 = vsubq_s16(step1[1], step1[6]); + *output7 = vsubq_s16(step1[0], step2[7]); +} + +static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, + const int16x4_t cospis1, + int16x8_t *const io0, int16x8_t *const io1, + int16x8_t *const io2, int16x8_t *const io3, + int16x8_t *const io4, int16x8_t *const io5, + int16x8_t *const io6, + int16x8_t *const io7) { + int16x4_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h, + input_7l, input_7h; + int16x4_t step1l[4], step1h[4]; + int16x8_t step1[8], step2[8]; + int32x4_t t32[8]; + int16x4_t t16[8]; + + transpose_s16_8x8(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + input_1l = vget_low_s16(*io1); + input_1h = vget_high_s16(*io1); + input_3l = vget_low_s16(*io3); + input_3h = vget_high_s16(*io3); + input_5l = vget_low_s16(*io5); + input_5h = vget_high_s16(*io5); + input_7l = vget_low_s16(*io7); + input_7h = vget_high_s16(*io7); + step1l[0] = vget_low_s16(*io0); + step1h[0] = vget_high_s16(*io0); + step1l[1] = vget_low_s16(*io2); + step1h[1] = vget_high_s16(*io2); + step1l[2] = vget_low_s16(*io4); + step1h[2] = vget_high_s16(*io4); + step1l[3] = vget_low_s16(*io6); + step1h[3] = vget_high_s16(*io6); + + t32[0] = vmull_lane_s16(input_1l, cospis1, 3); + t32[1] = vmull_lane_s16(input_1h, cospis1, 3); + t32[2] = vmull_lane_s16(input_3l, cospis1, 2); + t32[3] = vmull_lane_s16(input_3h, cospis1, 2); + t32[4] = vmull_lane_s16(input_3l, cospis1, 1); + t32[5] = vmull_lane_s16(input_3h, cospis1, 1); + t32[6] = vmull_lane_s16(input_1l, cospis1, 0); + t32[7] = vmull_lane_s16(input_1h, cospis1, 0); + t32[0] = vmlsl_lane_s16(t32[0], input_7l, cospis1, 0); + t32[1] = vmlsl_lane_s16(t32[1], input_7h, cospis1, 0); + t32[2] = vmlal_lane_s16(t32[2], input_5l, cospis1, 1); + t32[3] = vmlal_lane_s16(t32[3], input_5h, cospis1, 1); + t32[4] = vmlsl_lane_s16(t32[4], input_5l, cospis1, 2); + t32[5] = vmlsl_lane_s16(t32[5], input_5h, 
cospis1, 2); + t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3); + t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3); + t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); + t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); + t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); + t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS); + t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS); + t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS); + t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS); + step1[4] = vcombine_s16(t16[0], t16[1]); + step1[5] = vcombine_s16(t16[2], t16[3]); + step1[6] = vcombine_s16(t16[4], t16[5]); + step1[7] = vcombine_s16(t16[6], t16[7]); + + // stage 2 + t32[2] = vmull_lane_s16(step1l[0], cospis0, 2); + t32[3] = vmull_lane_s16(step1h[0], cospis0, 2); + t32[4] = vmull_lane_s16(step1l[1], cospis0, 3); + t32[5] = vmull_lane_s16(step1h[1], cospis0, 3); + t32[6] = vmull_lane_s16(step1l[1], cospis0, 1); + t32[7] = vmull_lane_s16(step1h[1], cospis0, 1); + t32[0] = vmlal_lane_s16(t32[2], step1l[2], cospis0, 2); + t32[1] = vmlal_lane_s16(t32[3], step1h[2], cospis0, 2); + t32[2] = vmlsl_lane_s16(t32[2], step1l[2], cospis0, 2); + t32[3] = vmlsl_lane_s16(t32[3], step1h[2], cospis0, 2); + t32[4] = vmlsl_lane_s16(t32[4], step1l[3], cospis0, 1); + t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1); + t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3); + t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3); + t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); + t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); + t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); + t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS); + t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS); + t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS); + t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS); + step2[0] = vcombine_s16(t16[0], t16[1]); + step2[1] = vcombine_s16(t16[2], t16[3]); + step2[2] = vcombine_s16(t16[4], t16[5]); + step2[3] = vcombine_s16(t16[6], t16[7]); + + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s16(step2[0], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[0], step2[3]); + + t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2); + t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2); + t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); + t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); + t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); + step1[5] = vcombine_s16(t16[0], t16[1]); + step1[6] = vcombine_s16(t16[2], t16[3]); + + // stage 4 + *io0 = vaddq_s16(step1[0], step2[7]); + *io1 = vaddq_s16(step1[1], step1[6]); + *io2 = vaddq_s16(step1[2], step1[5]); + *io3 = vaddq_s16(step1[3], step2[4]); + *io4 = vsubq_s16(step1[3], step2[4]); + *io5 = vsubq_s16(step1[2], step1[5]); + *io6 = vsubq_s16(step1[1], step1[6]); + *io7 = vsubq_s16(step1[0], step2[7]); +} + +static INLINE void idct16x16_add_wrap_low_8x2(const int32x4_t *const t32, + int16x8_t *const d0, + int16x8_t 
*const d1) { + int16x4_t t16[4]; + + t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); + t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); + t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); + *d0 = vcombine_s16(t16[0], t16[1]); + *d1 = vcombine_s16(t16[2], t16[3]); +} + +static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0, + const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int32x4_t *const t32) { + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_0_8_16_24, 3); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_0_8_16_24, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_0_8_16_24, 1); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_0_8_16_24, 1); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_0_8_16_24, 1); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_0_8_16_24, 1); +} + +static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int16x8_t *const d0, + int16x8_t *const d1) { + int32x4_t t32[4]; + + idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); + t32[2] = vnegq_s32(t32[2]); + t32[3] = vnegq_s32(t32[3]); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int16x8_t *const d0, + int16x8_t *const d1) { + int32x4_t t32[6]; + + t32[4] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 2); + t32[5] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 2); + t32[0] = vmlsl_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2); + t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); + t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2); + t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_2_30_10_22, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_4_12_20N_28, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), 
cospi_4_12_20N_28, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_6_26N_14_18N, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 0); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 0); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 0); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 0); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 1); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 1); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 1); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 1); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_2_30_10_22, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_4_12_20N_28, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_6_26N_14_18N, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 2); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 2); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 2); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 2); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 3); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 3); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 3); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 3); + 
idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct16x16_add_stage7(const int16x8_t *const step2, + int16x8_t *const out) { +#if CONFIG_VP9_HIGHBITDEPTH + // Use saturating add/sub to avoid overflow in 2nd pass + out[0] = vqaddq_s16(step2[0], step2[15]); + out[1] = vqaddq_s16(step2[1], step2[14]); + out[2] = vqaddq_s16(step2[2], step2[13]); + out[3] = vqaddq_s16(step2[3], step2[12]); + out[4] = vqaddq_s16(step2[4], step2[11]); + out[5] = vqaddq_s16(step2[5], step2[10]); + out[6] = vqaddq_s16(step2[6], step2[9]); + out[7] = vqaddq_s16(step2[7], step2[8]); + out[8] = vqsubq_s16(step2[7], step2[8]); + out[9] = vqsubq_s16(step2[6], step2[9]); + out[10] = vqsubq_s16(step2[5], step2[10]); + out[11] = vqsubq_s16(step2[4], step2[11]); + out[12] = vqsubq_s16(step2[3], step2[12]); + out[13] = vqsubq_s16(step2[2], step2[13]); + out[14] = vqsubq_s16(step2[1], step2[14]); + out[15] = vqsubq_s16(step2[0], step2[15]); +#else + out[0] = vaddq_s16(step2[0], step2[15]); + out[1] = vaddq_s16(step2[1], step2[14]); + out[2] = vaddq_s16(step2[2], step2[13]); + out[3] = vaddq_s16(step2[3], step2[12]); + out[4] = vaddq_s16(step2[4], step2[11]); + out[5] = vaddq_s16(step2[5], step2[10]); + out[6] = vaddq_s16(step2[6], step2[9]); + out[7] = vaddq_s16(step2[7], step2[8]); + out[8] = vsubq_s16(step2[7], step2[8]); + out[9] = vsubq_s16(step2[6], step2[9]); + out[10] = vsubq_s16(step2[5], step2[10]); + out[11] = vsubq_s16(step2[4], step2[11]); + out[12] = vsubq_s16(step2[3], step2[12]); + out[13] = vsubq_s16(step2[2], step2[13]); + out[14] = vsubq_s16(step2[1], step2[14]); + out[15] = vsubq_s16(step2[0], step2[15]); +#endif +} + +static INLINE void idct16x16_store_pass1(const int16x8_t *const out, + int16_t *output) { + // Save the result into output + vst1q_s16(output, out[0]); + output += 16; + vst1q_s16(output, out[1]); + output += 16; + vst1q_s16(output, out[2]); + output += 16; + vst1q_s16(output, out[3]); + output += 16; + vst1q_s16(output, out[4]); + output += 16; + vst1q_s16(output, out[5]); + output += 16; + vst1q_s16(output, out[6]); + output += 16; + vst1q_s16(output, out[7]); + output += 16; + vst1q_s16(output, out[8]); + output += 16; + vst1q_s16(output, out[9]); + output += 16; + vst1q_s16(output, out[10]); + output += 16; + vst1q_s16(output, out[11]); + output += 16; + vst1q_s16(output, out[12]); + output += 16; + vst1q_s16(output, out[13]); + output += 16; + vst1q_s16(output, out[14]); + output += 16; + vst1q_s16(output, out[15]); +} + +static INLINE void idct16x16_add8x1(int16x8_t res, uint8_t **dest, + const int stride) { + uint8x8_t d = vld1_u8(*dest); + uint16x8_t q; + + res = vrshrq_n_s16(res, 6); + q = vaddw_u8(vreinterpretq_u16_s16(res), d); + d = vqmovun_s16(vreinterpretq_s16_u16(q)); + vst1_u8(*dest, d); + *dest += stride; +} + +static INLINE void highbd_idct16x16_add8x1(int16x8_t res, const int16x8_t max, + uint16_t **dest, const int stride) { + uint16x8_t d = vld1q_u16(*dest); + + res = vqaddq_s16(res, vreinterpretq_s16_u16(d)); + res = vminq_s16(res, max); + d = vqshluq_n_s16(res, 0); + vst1q_u16(*dest, d); + *dest += stride; +} + +static INLINE void highbd_idct16x16_add8x1_bd8(int16x8_t res, uint16_t **dest, + const int stride) { + uint16x8_t d = vld1q_u16(*dest); + + res = vrsraq_n_s16(vreinterpretq_s16_u16(d), res, 6); + d = vmovl_u8(vqmovun_s16(res)); + vst1q_u16(*dest, d); + *dest += stride; +} + +static INLINE void highbd_add_and_store_bd8(const int16x8_t *const a, + uint16_t *out, const int b_stride) { + highbd_idct16x16_add8x1_bd8(a[0], &out, b_stride); + 
highbd_idct16x16_add8x1_bd8(a[1], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[2], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[3], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[4], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[5], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[6], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[7], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[8], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[9], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[10], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[11], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[12], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[13], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[14], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[15], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[16], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[17], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[18], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[19], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[20], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[21], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[22], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[23], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[24], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[25], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[26], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[27], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[28], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[29], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[30], &out, b_stride); + highbd_idct16x16_add8x1_bd8(a[31], &out, b_stride); +} + +static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out, + uint16_t *dest, const int stride, + const int bd) { + // Add the result to dest + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int16x8_t o[16]; + o[0] = vcombine_s16(vrshrn_n_s32(out[0].val[0], 6), + vrshrn_n_s32(out[0].val[1], 6)); + o[1] = vcombine_s16(vrshrn_n_s32(out[1].val[0], 6), + vrshrn_n_s32(out[1].val[1], 6)); + o[2] = vcombine_s16(vrshrn_n_s32(out[2].val[0], 6), + vrshrn_n_s32(out[2].val[1], 6)); + o[3] = vcombine_s16(vrshrn_n_s32(out[3].val[0], 6), + vrshrn_n_s32(out[3].val[1], 6)); + o[4] = vcombine_s16(vrshrn_n_s32(out[4].val[0], 6), + vrshrn_n_s32(out[4].val[1], 6)); + o[5] = vcombine_s16(vrshrn_n_s32(out[5].val[0], 6), + vrshrn_n_s32(out[5].val[1], 6)); + o[6] = vcombine_s16(vrshrn_n_s32(out[6].val[0], 6), + vrshrn_n_s32(out[6].val[1], 6)); + o[7] = vcombine_s16(vrshrn_n_s32(out[7].val[0], 6), + vrshrn_n_s32(out[7].val[1], 6)); + o[8] = vcombine_s16(vrshrn_n_s32(out[8].val[0], 6), + vrshrn_n_s32(out[8].val[1], 6)); + o[9] = vcombine_s16(vrshrn_n_s32(out[9].val[0], 6), + vrshrn_n_s32(out[9].val[1], 6)); + o[10] = vcombine_s16(vrshrn_n_s32(out[10].val[0], 6), + vrshrn_n_s32(out[10].val[1], 6)); + o[11] = vcombine_s16(vrshrn_n_s32(out[11].val[0], 6), + vrshrn_n_s32(out[11].val[1], 6)); + o[12] = vcombine_s16(vrshrn_n_s32(out[12].val[0], 6), + vrshrn_n_s32(out[12].val[1], 6)); + o[13] = vcombine_s16(vrshrn_n_s32(out[13].val[0], 6), + vrshrn_n_s32(out[13].val[1], 6)); + o[14] = vcombine_s16(vrshrn_n_s32(out[14].val[0], 6), + vrshrn_n_s32(out[14].val[1], 6)); + o[15] = vcombine_s16(vrshrn_n_s32(out[15].val[0], 6), + vrshrn_n_s32(out[15].val[1], 6)); + highbd_idct16x16_add8x1(o[0], max, &dest, stride); + highbd_idct16x16_add8x1(o[1], max, &dest, stride); + highbd_idct16x16_add8x1(o[2], max, &dest, stride); + highbd_idct16x16_add8x1(o[3], max, &dest, stride); + 
highbd_idct16x16_add8x1(o[4], max, &dest, stride); + highbd_idct16x16_add8x1(o[5], max, &dest, stride); + highbd_idct16x16_add8x1(o[6], max, &dest, stride); + highbd_idct16x16_add8x1(o[7], max, &dest, stride); + highbd_idct16x16_add8x1(o[8], max, &dest, stride); + highbd_idct16x16_add8x1(o[9], max, &dest, stride); + highbd_idct16x16_add8x1(o[10], max, &dest, stride); + highbd_idct16x16_add8x1(o[11], max, &dest, stride); + highbd_idct16x16_add8x1(o[12], max, &dest, stride); + highbd_idct16x16_add8x1(o[13], max, &dest, stride); + highbd_idct16x16_add8x1(o[14], max, &dest, stride); + highbd_idct16x16_add8x1(o[15], max, &dest, stride); +} + +void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag); + +void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output, + void *const dest, const int stride, + const int highbd_flag); + +void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input, + int16_t *output); + +void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input, + int16_t *const output, void *const dest, + const int stride, const int highbd_flag); + +void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest, + const int stride, const int highbd_flag); + +void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output); +void vpx_idct32_16_neon(const int16_t *const input, void *const output, + const int stride, const int highbd_flag); + +void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output); +void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride, + const int highbd_flag); + +#endif // VPX_DSP_ARM_IDCT_NEON_H_ diff --git a/libs/libvpx/vpx_dsp/arm/intrapred_neon.c b/libs/libvpx/vpx_dsp/arm/intrapred_neon.c index 38e79ed69d..fb1fa6b681 100644 --- a/libs/libvpx/vpx_dsp/arm/intrapred_neon.c +++ b/libs/libvpx/vpx_dsp/arm/intrapred_neon.c @@ -17,306 +17,254 @@ //------------------------------------------------------------------------------ // DC 4x4 -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
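+// The dc_sum_N() helpers below fold the N reference pixels into a single
+// sum in lane 0; the matching dc_store_NxN() helpers broadcast lane 0 of the
+// DC value to every row of the block. With both edges available, the 4x4 DC
+// value is (above[0] + ... + above[3] + left[0] + ... + left[3] + 4) >> 3,
+// which vrshr_n_u16(sum, 3) computes as a rounding shift.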
-static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, - const uint8_t *left, int do_above, int do_left) { - uint16x4_t sum_top; - uint16x4_t sum_left; - uint16x4_t dc0; +static INLINE uint16x4_t dc_sum_4(const uint8_t *ref) { + const uint8x8_t ref_u8 = vld1_u8(ref); + const uint16x4_t p0 = vpaddl_u8(ref_u8); + return vpadd_u16(p0, p0); +} - if (do_above) { - const uint8x8_t A = vld1_u8(above); // top row - const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top - sum_top = vpadd_u16(p0, p0); - } - - if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border - const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left - sum_left = vpadd_u16(p0, p0); - } - - if (do_above && do_left) { - const uint16x4_t sum = vadd_u16(sum_left, sum_top); - dc0 = vrshr_n_u16(sum, 3); - } else if (do_above) { - dc0 = vrshr_n_u16(sum_top, 2); - } else if (do_left) { - dc0 = vrshr_n_u16(sum_left, 2); - } else { - dc0 = vdup_n_u16(0x80); - } - - { - const uint8x8_t dc = vdup_lane_u8(vreinterpret_u8_u16(dc0), 0); - int i; - for (i = 0; i < 4; ++i) { - vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0); - } +static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc_dup), 0); } } void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_4x4(dst, stride, above, left, 1, 1); + const uint8x8_t a = vld1_u8(above); + const uint8x8_t l = vld1_u8(left); + const uint16x8_t al = vaddl_u8(a, l); + uint16x4_t sum; + uint8x8_t dc; + sum = vpadd_u16(vget_low_u16(al), vget_low_u16(al)); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + dc_store_4x4(dst, stride, dc); } void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_4(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); (void)above; - dc_4x4(dst, stride, NULL, left, 0, 1); + dc_store_4x4(dst, stride, dc); } void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_4(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); (void)left; - dc_4x4(dst, stride, above, NULL, 1, 0); + dc_store_4x4(dst, stride, dc); } void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_4x4(dst, stride, NULL, NULL, 0, 0); + dc_store_4x4(dst, stride, dc); } //------------------------------------------------------------------------------ // DC 8x8 -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
-static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, - const uint8_t *left, int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; +static INLINE uint16x4_t dc_sum_8(const uint8_t *ref) { + const uint8x8_t ref_u8 = vld1_u8(ref); + uint16x4_t sum = vpaddl_u8(ref_u8); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} - if (do_above) { - const uint8x8_t A = vld1_u8(above); // top row - const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top - const uint16x4_t p1 = vpadd_u16(p0, p0); - const uint16x4_t p2 = vpadd_u16(p1, p1); - sum_top = vcombine_u16(p2, p2); - } - - if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border - const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left - const uint16x4_t p1 = vpadd_u16(p0, p0); - const uint16x4_t p2 = vpadd_u16(p1, p1); - sum_left = vcombine_u16(p2, p2); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 4); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 3); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 3); - } else { - dc0 = vdup_n_u8(0x80); - } - - { - const uint8x8_t dc = vdup_lane_u8(dc0, 0); - int i; - for (i = 0; i < 8; ++i) { - vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc)); - } +static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + vst1_u8(dst, dc_dup); } } void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_8x8(dst, stride, above, left, 1, 1); + const uint8x8_t above_u8 = vld1_u8(above); + const uint8x8_t left_u8 = vld1_u8(left); + const uint8x16_t above_and_left = vcombine_u8(above_u8, left_u8); + const uint16x8_t p0 = vpaddlq_u8(above_and_left); + uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + uint8x8_t dc; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + dc_store_8x8(dst, stride, dc); } void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_8(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); (void)above; - dc_8x8(dst, stride, NULL, left, 0, 1); + dc_store_8x8(dst, stride, dc); } void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_8(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); (void)left; - dc_8x8(dst, stride, above, NULL, 1, 0); + dc_store_8x8(dst, stride, dc); } void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_8x8(dst, stride, NULL, NULL, 0, 0); + dc_store_8x8(dst, stride, dc); } //------------------------------------------------------------------------------ // DC 16x16 -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
-static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; +static INLINE uint16x4_t dc_sum_16(const uint8_t *ref) { + const uint8x16_t ref_u8 = vld1q_u8(ref); + const uint16x8_t p0 = vpaddlq_u8(ref_u8); + uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} - if (do_above) { - const uint8x16_t A = vld1q_u8(above); // top row - const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top - const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - const uint16x4_t p2 = vpadd_u16(p1, p1); - const uint16x4_t p3 = vpadd_u16(p2, p2); - sum_top = vcombine_u16(p3, p3); - } - - if (do_left) { - const uint8x16_t L = vld1q_u8(left); // left row - const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left - const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - const uint16x4_t p2 = vpadd_u16(p1, p1); - const uint16x4_t p3 = vpadd_u16(p2, p2); - sum_left = vcombine_u16(p3, p3); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 5); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 4); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 4); - } else { - dc0 = vdup_n_u8(0x80); - } - - { - const uint8x16_t dc = vdupq_lane_u8(dc0, 0); - int i; - for (i = 0; i < 16; ++i) { - vst1q_u8(dst + i * stride, dc); - } +static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + const uint8x16_t dc_dup = vdupq_lane_u8(dc, 0); + int i; + for (i = 0; i < 16; ++i, dst += stride) { + vst1q_u8(dst, dc_dup); } } void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_16x16(dst, stride, above, left, 1, 1); + const uint8x16_t ref0 = vld1q_u8(above); + const uint8x16_t ref1 = vld1q_u8(left); + const uint16x8_t p0 = vpaddlq_u8(ref0); + const uint16x8_t p1 = vpaddlq_u8(ref1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + uint8x8_t dc; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + dc_store_16x16(dst, stride, dc); } void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_16(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); (void)above; - dc_16x16(dst, stride, NULL, left, 0, 1); + dc_store_16x16(dst, stride, dc); } void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_16(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); (void)left; - dc_16x16(dst, stride, above, NULL, 1, 0); + dc_store_16x16(dst, stride, dc); } void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_16x16(dst, stride, NULL, NULL, 0, 0); + dc_store_16x16(dst, stride, dc); } //------------------------------------------------------------------------------ // DC 32x32 -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
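+// dc_sum_32() loads the 32 reference bytes with vld2q_u8; the de-interleaved
+// lane order is irrelevant because only the overall sum is used.
+// dc_store_32x32() writes each 32-byte row as two identical 16-byte vectors
+// through a single vst2q_u8.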
-static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; +static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) { + const uint8x16x2_t r = vld2q_u8(ref); + const uint16x8_t p0 = vpaddlq_u8(r.val[0]); + const uint16x8_t p1 = vpaddlq_u8(r.val[1]); + const uint16x8_t p2 = vaddq_u16(p0, p1); + uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} - if (do_above) { - const uint8x16_t A0 = vld1q_u8(above); // top row - const uint8x16_t A1 = vld1q_u8(above + 16); - const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top - const uint16x8_t p1 = vpaddlq_u8(A1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - const uint16x4_t p4 = vpadd_u16(p3, p3); - const uint16x4_t p5 = vpadd_u16(p4, p4); - sum_top = vcombine_u16(p5, p5); - } +static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + uint8x16x2_t dc_dup; + int i; + dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u8(dc, 0); - if (do_left) { - const uint8x16_t L0 = vld1q_u8(left); // left row - const uint8x16_t L1 = vld1q_u8(left + 16); - const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left - const uint16x8_t p1 = vpaddlq_u8(L1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - const uint16x4_t p4 = vpadd_u16(p3, p3); - const uint16x4_t p5 = vpadd_u16(p4, p4); - sum_left = vcombine_u16(p5, p5); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 6); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 5); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 5); - } else { - dc0 = vdup_n_u8(0x80); - } - - { - const uint8x16_t dc = vdupq_lane_u8(dc0, 0); - int i; - for (i = 0; i < 32; ++i) { - vst1q_u8(dst + i * stride, dc); - vst1q_u8(dst + i * stride + 16, dc); - } + for (i = 0; i < 32; ++i, dst += stride) { + vst2q_u8(dst, dc_dup); } } void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_32x32(dst, stride, above, left, 1, 1); + const uint8x16x2_t a = vld2q_u8(above); + const uint8x16x2_t l = vld2q_u8(left); + const uint16x8_t pa0 = vpaddlq_u8(a.val[0]); + const uint16x8_t pl0 = vpaddlq_u8(l.val[0]); + const uint16x8_t pa1 = vpaddlq_u8(a.val[1]); + const uint16x8_t pl1 = vpaddlq_u8(l.val[1]); + const uint16x8_t pa = vaddq_u16(pa0, pa1); + const uint16x8_t pl = vaddq_u16(pl0, pl1); + const uint16x8_t pal = vaddq_u16(pa, pl); + uint16x4_t sum = vadd_u16(vget_low_u16(pal), vget_high_u16(pal)); + uint8x8_t dc; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 6)); + dc_store_32x32(dst, stride, dc); } void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_32(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); (void)above; - dc_32x32(dst, stride, NULL, left, 0, 1); + dc_store_32x32(dst, stride, dc); } void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_32(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); (void)left; - dc_32x32(dst, stride, 
above, NULL, 1, 0); + dc_store_32x32(dst, stride, dc); } void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_32x32(dst, stride, NULL, NULL, 0, 0); + dc_store_32x32(dst, stride, dc); } // ----------------------------------------------------------------------------- void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above)); // top row - const uint64x1_t A1 = vshr_n_u64(A0, 8); - const uint64x1_t A2 = vshr_n_u64(A0, 16); - const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0); + const uint8x8_t ABCDEFGH = vld1_u8(above); + const uint64x1_t A1 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 8); + const uint64x1_t A2 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 16); const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00); @@ -331,485 +279,764 @@ void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); - dst[3 * stride + 3] = above[7]; + vst1_lane_u8(dst + 3 * stride + 3, ABCDEFGH, 7); +} + +static INLINE void d45_store_8(uint8_t **dst, const ptrdiff_t stride, + const uint8x8_t above_right, uint8x8_t *row) { + *row = vext_u8(*row, above_right, 1); + vst1_u8(*dst, *row); + *dst += stride; } void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 }; - static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 }; - const uint8x8_t sh_12345677 = vld1_u8(shuffle1); - const uint8x8_t sh_23456777 = vld1_u8(shuffle2); - const uint8x8_t A0 = vld1_u8(above); // top row - const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677); - const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777); + const uint8x8_t A0 = vld1_u8(above); + const uint8x8_t above_right = vdup_lane_u8(A0, 7); + const uint8x8_t A1 = vext_u8(A0, above_right, 1); + const uint8x8_t A2 = vext_u8(A0, above_right, 2); const uint8x8_t avg1 = vhadd_u8(A0, A2); uint8x8_t row = vrhadd_u8(avg1, A1); - int i; (void)left; - for (i = 0; i < 7; ++i) { - vst1_u8(dst + i * stride, row); - row = vtbl1_u8(row, sh_12345677); - } - vst1_u8(dst + i * stride, row); + + vst1_u8(dst, row); + dst += stride; + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + vst1_u8(dst, above_right); +} + +static INLINE void d45_store_16(uint8_t **dst, const ptrdiff_t stride, + const uint8x16_t above_right, uint8x16_t *row) { + *row = vextq_u8(*row, above_right, 1); + vst1q_u8(*dst, *row); + *dst += stride; } void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0 = vld1q_u8(above); // top row - const uint8x16_t above_right = vld1q_dup_u8(above + 15); + const uint8x16_t A0 = vld1q_u8(above); + const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0), 7); const uint8x16_t A1 = vextq_u8(A0, above_right, 1); const uint8x16_t A2 = vextq_u8(A0, above_right, 2); const uint8x16_t avg1 
= vhaddq_u8(A0, A2); uint8x16_t row = vrhaddq_u8(avg1, A1); + (void)left; + + vst1q_u8(dst, row); + dst += stride; + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + vst1q_u8(dst, above_right); +} + +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t A0_0 = vld1q_u8(above); + const uint8x16_t A0_1 = vld1q_u8(above + 16); + const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0_1), 7); + const uint8x16_t A1_0 = vld1q_u8(above + 1); + const uint8x16_t A1_1 = vld1q_u8(above + 17); + const uint8x16_t A2_0 = vld1q_u8(above + 2); + const uint8x16_t A2_1 = vld1q_u8(above + 18); + const uint8x16_t avg_0 = vhaddq_u8(A0_0, A2_0); + const uint8x16_t avg_1 = vhaddq_u8(A0_1, A2_1); + uint8x16_t row_0 = vrhaddq_u8(avg_0, A1_0); + uint8x16_t row_1 = vrhaddq_u8(avg_1, A1_1); int i; (void)left; - for (i = 0; i < 15; ++i) { - vst1q_u8(dst + i * stride, row); - row = vextq_u8(row, above_right, 1); + + vst1q_u8(dst, row_0); + dst += 16; + vst1q_u8(dst, row_1); + dst += stride - 16; + + for (i = 0; i < 30; ++i) { + row_0 = vextq_u8(row_0, row_1, 1); + row_1 = vextq_u8(row_1, above_right, 1); + vst1q_u8(dst, row_0); + dst += 16; + vst1q_u8(dst, row_1); + dst += stride - 16; } - vst1q_u8(dst + i * stride, row); + + vst1q_u8(dst, above_right); + dst += 16; + vst1q_u8(dst, row_1); } // ----------------------------------------------------------------------------- void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t XABCD_u8 = vld1_u8(above - 1); - const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); - const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); - const uint32x2_t zero = vdup_n_u32(0); - const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0); - const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL); - const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8)); - const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); - const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); - const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); - const uint8_t D = vget_lane_u8(XABCD_u8, 4); - const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); - const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); - const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); - const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); + const uint8x8_t XA0123 = vld1_u8(above - 1); + const uint8x8_t L0123 = vld1_u8(left); + const uint8x8_t L3210 = vrev64_u8(L0123); + const uint8x8_t L3210XA012 = vext_u8(L3210, XA0123, 4); + const uint8x8_t L210XA0123 = vext_u8(L3210, XA0123, 5); + const uint8x8_t L10XA0123_ = + vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(L210XA0123), 8)); + const uint8x8_t avg1 = vhadd_u8(L10XA0123_, L3210XA012); + const uint8x8_t avg2 = vrhadd_u8(avg1, L210XA0123); const 
uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); const uint32x2_t r3 = vreinterpret_u32_u8(avg2); const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); - vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); - vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); - vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); - vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); + vst1_lane_u32((uint32_t *)dst, r0, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, r1, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, r2, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, r3, 0); } +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t XA0123456 = vld1_u8(above - 1); + const uint8x8_t A01234567 = vld1_u8(above); + const uint8x8_t A1234567_ = vld1_u8(above + 1); + const uint8x8_t L01234567 = vld1_u8(left); + const uint8x8_t L76543210 = vrev64_u8(L01234567); + const uint8x8_t L6543210X = vext_u8(L76543210, XA0123456, 1); + const uint8x8_t L543210XA0 = vext_u8(L76543210, XA0123456, 2); + const uint8x16_t L76543210XA0123456 = vcombine_u8(L76543210, XA0123456); + const uint8x16_t L6543210XA01234567 = vcombine_u8(L6543210X, A01234567); + const uint8x16_t L543210XA01234567_ = vcombine_u8(L543210XA0, A1234567_); + const uint8x16_t avg = vhaddq_u8(L76543210XA0123456, L543210XA01234567_); + const uint8x16_t row = vrhaddq_u8(avg, L6543210XA01234567); + const uint8x8_t row_0 = vget_low_u8(row); + const uint8x8_t row_1 = vget_high_u8(row); + const uint8x8_t r0 = vext_u8(row_0, row_1, 7); + const uint8x8_t r1 = vext_u8(row_0, row_1, 6); + const uint8x8_t r2 = vext_u8(row_0, row_1, 5); + const uint8x8_t r3 = vext_u8(row_0, row_1, 4); + const uint8x8_t r4 = vext_u8(row_0, row_1, 3); + const uint8x8_t r5 = vext_u8(row_0, row_1, 2); + const uint8x8_t r6 = vext_u8(row_0, row_1, 1); + + vst1_u8(dst, r0); + dst += stride; + vst1_u8(dst, r1); + dst += stride; + vst1_u8(dst, r2); + dst += stride; + vst1_u8(dst, r3); + dst += stride; + vst1_u8(dst, r4); + dst += stride; + vst1_u8(dst, r5); + dst += stride; + vst1_u8(dst, r6); + dst += stride; + vst1_u8(dst, row_0); +} + +static INLINE void d135_store_16x8( + uint8_t **dst, const ptrdiff_t stride, const uint8x16_t row_0, + const uint8x16_t row_1, const uint8x16_t row_2, const uint8x16_t row_3, + const uint8x16_t row_4, const uint8x16_t row_5, const uint8x16_t row_6, + const uint8x16_t row_7) { + vst1q_u8(*dst, row_0); + *dst += stride; + vst1q_u8(*dst, row_1); + *dst += stride; + vst1q_u8(*dst, row_2); + *dst += stride; + vst1q_u8(*dst, row_3); + *dst += stride; + vst1q_u8(*dst, row_4); + *dst += stride; + vst1q_u8(*dst, row_5); + *dst += stride; + vst1q_u8(*dst, row_6); + *dst += stride; + vst1q_u8(*dst, row_7); + *dst += stride; +} + +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t XA0123456789abcde = vld1q_u8(above - 1); + const uint8x16_t A0123456789abcdef = vld1q_u8(above); + const uint8x16_t A123456789abcdef_ = vld1q_u8(above + 1); + const uint8x16_t L0123456789abcdef = vld1q_u8(left); + const uint8x8_t L76543210 = vrev64_u8(vget_low_u8(L0123456789abcdef)); + const uint8x8_t Lfedcba98 = vrev64_u8(vget_high_u8(L0123456789abcdef)); + const uint8x16_t Lfedcba9876543210 = vcombine_u8(Lfedcba98, L76543210); + const uint8x16_t Ledcba9876543210X = + 
vextq_u8(Lfedcba9876543210, XA0123456789abcde, 1); + const uint8x16_t Ldcba9876543210XA0 = + vextq_u8(Lfedcba9876543210, XA0123456789abcde, 2); + const uint8x16_t avg_0 = vhaddq_u8(Lfedcba9876543210, Ldcba9876543210XA0); + const uint8x16_t avg_1 = vhaddq_u8(XA0123456789abcde, A123456789abcdef_); + const uint8x16_t row_0 = vrhaddq_u8(avg_0, Ledcba9876543210X); + const uint8x16_t row_1 = vrhaddq_u8(avg_1, A0123456789abcdef); + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15); + const uint8x16_t r_1 = vextq_u8(row_0, row_1, 14); + const uint8x16_t r_2 = vextq_u8(row_0, row_1, 13); + const uint8x16_t r_3 = vextq_u8(row_0, row_1, 12); + const uint8x16_t r_4 = vextq_u8(row_0, row_1, 11); + const uint8x16_t r_5 = vextq_u8(row_0, row_1, 10); + const uint8x16_t r_6 = vextq_u8(row_0, row_1, 9); + const uint8x16_t r_7 = vcombine_u8(vget_high_u8(row_0), vget_low_u8(row_1)); + const uint8x16_t r_8 = vextq_u8(row_0, row_1, 7); + const uint8x16_t r_9 = vextq_u8(row_0, row_1, 6); + const uint8x16_t r_a = vextq_u8(row_0, row_1, 5); + const uint8x16_t r_b = vextq_u8(row_0, row_1, 4); + const uint8x16_t r_c = vextq_u8(row_0, row_1, 3); + const uint8x16_t r_d = vextq_u8(row_0, row_1, 2); + const uint8x16_t r_e = vextq_u8(row_0, row_1, 1); + + d135_store_16x8(&dst, stride, r_0, r_1, r_2, r_3, r_4, r_5, r_6, r_7); + d135_store_16x8(&dst, stride, r_8, r_9, r_a, r_b, r_c, r_d, r_e, row_0); +} + +static INLINE void d135_store_32x2(uint8_t **dst, const ptrdiff_t stride, + const uint8x16_t row_0, + const uint8x16_t row_1, + const uint8x16_t row_2) { + uint8_t *dst2 = *dst; + vst1q_u8(dst2, row_1); + dst2 += 16; + vst1q_u8(dst2, row_2); + dst2 += 16 * stride - 16; + vst1q_u8(dst2, row_0); + dst2 += 16; + vst1q_u8(dst2, row_1); + *dst += stride; +} + +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t LL0123456789abcdef = vld1q_u8(left + 16); + const uint8x16_t LU0123456789abcdef = vld1q_u8(left); + const uint8x8_t LL76543210 = vrev64_u8(vget_low_u8(LL0123456789abcdef)); + const uint8x8_t LU76543210 = vrev64_u8(vget_low_u8(LU0123456789abcdef)); + const uint8x8_t LLfedcba98 = vrev64_u8(vget_high_u8(LL0123456789abcdef)); + const uint8x8_t LUfedcba98 = vrev64_u8(vget_high_u8(LU0123456789abcdef)); + const uint8x16_t LLfedcba9876543210 = vcombine_u8(LLfedcba98, LL76543210); + const uint8x16_t LUfedcba9876543210 = vcombine_u8(LUfedcba98, LU76543210); + const uint8x16_t LLedcba9876543210Uf = + vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 1); + const uint8x16_t LLdcba9876543210Ufe = + vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 2); + const uint8x16_t avg_0 = vhaddq_u8(LLfedcba9876543210, LLdcba9876543210Ufe); + const uint8x16_t row_0 = vrhaddq_u8(avg_0, LLedcba9876543210Uf); + + const uint8x16_t XAL0123456789abcde = vld1q_u8(above - 1); + const uint8x16_t LUedcba9876543210X = + vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 1); + const uint8x16_t LUdcba9876543210XA0 = + vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 2); + const uint8x16_t avg_1 = vhaddq_u8(LUfedcba9876543210, LUdcba9876543210XA0); + const uint8x16_t row_1 = vrhaddq_u8(avg_1, LUedcba9876543210X); + + const uint8x16_t AL0123456789abcdef = vld1q_u8(above); + const uint8x16_t AL123456789abcdefg = vld1q_u8(above + 1); + const uint8x16_t ALfR0123456789abcde = vld1q_u8(above + 15); + const uint8x16_t AR0123456789abcdef = vld1q_u8(above + 16); + const uint8x16_t AR123456789abcdef_ = vld1q_u8(above + 17); + const uint8x16_t avg_2 = vhaddq_u8(XAL0123456789abcde, 
AL123456789abcdefg); + const uint8x16_t row_2 = vrhaddq_u8(avg_2, AL0123456789abcdef); + const uint8x16_t avg_3 = vhaddq_u8(ALfR0123456789abcde, AR123456789abcdef_); + const uint8x16_t row_3 = vrhaddq_u8(avg_3, AR0123456789abcdef); + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 15); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 15); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 14); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 14); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 14); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 13); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 13); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 13); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 12); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 12); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 12); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 11); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 11); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 11); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 10); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 10); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 10); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 9); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 9); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 9); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 8); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 8); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 8); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 7); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 7); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 7); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 6); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 6); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 6); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 5); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 5); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 5); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 4); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 4); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 4); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 3); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 3); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 3); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 2); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 2); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 2); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 1); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 1); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 1); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + d135_store_32x2(&dst, stride, row_0, row_1, row_2); +} + +// 
----------------------------------------------------------------------------- + #if !HAVE_NEON_ASM void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint32_t d = *(const uint32_t *)above; int i; - uint32x2_t d0u32 = vdup_n_u32(0); (void)left; - d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); - for (i = 0; i < 4; i++, dst += stride) - vst1_lane_u32((uint32_t *)dst, d0u32, 0); + for (i = 0; i < 4; i++, dst += stride) { + *(uint32_t *)dst = d; + } } void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t d = vld1_u8(above); int i; - uint8x8_t d0u8 = vdup_n_u8(0); (void)left; - d0u8 = vld1_u8(above); - for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8); + for (i = 0; i < 8; i++, dst += stride) { + vst1_u8(dst, d); + } } void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vld1q_u8(above); int i; - uint8x16_t q0u8 = vdupq_n_u8(0); (void)left; - q0u8 = vld1q_u8(above); - for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8); + for (i = 0; i < 16; i++, dst += stride) { + vst1q_u8(dst, d); + } } void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(above); + const uint8x16_t d1 = vld1q_u8(above + 16); int i; - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); (void)left; - q0u8 = vld1q_u8(above); - q1u8 = vld1q_u8(above + 16); - for (i = 0; i < 32; i++, dst += stride) { - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); + for (i = 0; i < 32; i++) { + // Note: performance was worse using vst2q_u8 under gcc-4.9 & clang-3.8. + // clang-3.8 unrolled the loop fully with no filler so the cause is likely + // the latency of the instruction. 
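+ // Each iteration writes one 32-byte row as two 16-byte stores; advancing
+ // by stride - 16 after the second store returns dst to the start of the
+ // next row.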
+ vst1q_u8(dst, d0); + dst += 16; + vst1q_u8(dst, d1); + dst += stride - 16; } } +// ----------------------------------------------------------------------------- + void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d1u32 = vdup_n_u32(0); + const uint32x2_t zero = vdup_n_u32(0); + const uint8x8_t left_u8 = + vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)left, zero, 0)); + uint8x8_t d; (void)above; - d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); - - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 1); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 2); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 3); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); } void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint64x1_t d1u64 = vdup_n_u64(0); + const uint8x8_t left_u8 = vld1_u8(left); + uint8x8_t d; (void)above; - d1u64 = vld1_u64((const uint64_t *)left); + d = vdup_lane_u8(left_u8, 0); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 1); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 2); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 3); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 4); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 5); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 6); + vst1_u8(dst, d); + dst += stride; + d = vdup_lane_u8(left_u8, 7); + vst1_u8(dst, d); +} - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); - vst1_u8(dst, d0u8); +static INLINE void h_store_16x8(uint8_t **dst, const ptrdiff_t stride, + const uint8x8_t left) { + const uint8x16_t row_0 = vdupq_lane_u8(left, 0); + const uint8x16_t row_1 = vdupq_lane_u8(left, 1); + const uint8x16_t row_2 = vdupq_lane_u8(left, 2); + const uint8x16_t row_3 = vdupq_lane_u8(left, 3); + const uint8x16_t row_4 = vdupq_lane_u8(left, 4); + const uint8x16_t row_5 = vdupq_lane_u8(left, 5); + const uint8x16_t row_6 = vdupq_lane_u8(left, 6); + const uint8x16_t row_7 = vdupq_lane_u8(left, 7); + + vst1q_u8(*dst, row_0); + *dst += 
stride; + vst1q_u8(*dst, row_1); + *dst += stride; + vst1q_u8(*dst, row_2); + *dst += stride; + vst1q_u8(*dst, row_3); + *dst += stride; + vst1q_u8(*dst, row_4); + *dst += stride; + vst1q_u8(*dst, row_5); + *dst += stride; + vst1q_u8(*dst, row_6); + *dst += stride; + vst1q_u8(*dst, row_7); + *dst += stride; } void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); + const uint8x16_t left_u8q = vld1q_u8(left); (void)above; - q1u8 = vld1q_u8(left); - d2u8 = vget_low_u8(q1u8); - for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); - dst += stride; - } + h_store_16x8(&dst, stride, vget_low_u8(left_u8q)); + h_store_16x8(&dst, stride, vget_high_u8(left_u8q)); +} + +static INLINE void h_store_32x8(uint8_t **dst, const ptrdiff_t stride, + const uint8x8_t left) { + const uint8x16_t row_0 = vdupq_lane_u8(left, 0); + const uint8x16_t row_1 = vdupq_lane_u8(left, 1); + const uint8x16_t row_2 = vdupq_lane_u8(left, 2); + const uint8x16_t row_3 = vdupq_lane_u8(left, 3); + const uint8x16_t row_4 = vdupq_lane_u8(left, 4); + const uint8x16_t row_5 = vdupq_lane_u8(left, 5); + const uint8x16_t row_6 = vdupq_lane_u8(left, 6); + const uint8x16_t row_7 = vdupq_lane_u8(left, 7); + + vst1q_u8(*dst, row_0); // Note clang-3.8 produced poor code w/vst2q_u8 + *dst += 16; + vst1q_u8(*dst, row_0); + *dst += stride - 16; + vst1q_u8(*dst, row_1); + *dst += 16; + vst1q_u8(*dst, row_1); + *dst += stride - 16; + vst1q_u8(*dst, row_2); + *dst += 16; + vst1q_u8(*dst, row_2); + *dst += stride - 16; + vst1q_u8(*dst, row_3); + *dst += 16; + vst1q_u8(*dst, row_3); + *dst += stride - 16; + vst1q_u8(*dst, row_4); + *dst += 16; + vst1q_u8(*dst, row_4); + *dst += stride - 16; + vst1q_u8(*dst, row_5); + *dst += 16; + vst1q_u8(*dst, row_5); + *dst += stride - 16; + vst1q_u8(*dst, row_6); + *dst += 16; + vst1q_u8(*dst, row_6); + *dst += stride - 16; + vst1q_u8(*dst, row_7); + *dst += 16; + vst1q_u8(*dst, row_7); + *dst += stride - 16; } void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j, k; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); + int i; (void)above; - for (k = 0; k < 2; k++, left += 16) { - q1u8 = vld1q_u8(left); - d2u8 = vget_low_u8(q1u8); - for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - 
q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - } + for (i = 0; i < 2; i++, left += 16) { + const uint8x16_t left_u8 = vld1q_u8(left); + h_store_32x8(&dst, stride, vget_low_u8(left_u8)); + h_store_32x8(&dst, stride, vget_high_u8(left_u8)); } } +// ----------------------------------------------------------------------------- + +static INLINE int16x8_t convert_u8_to_s16(uint8x8_t v) { + return vreinterpretq_s16_u16(vmovl_u8(v)); +} + void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int i; - uint16x8_t q1u16, q3u16; - int16x8_t q1s16; - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d2u32 = vdup_n_u32(0); + const uint8x8_t top_left = vld1_dup_u8(above - 1); + const uint8x8_t left_u8 = vld1_u8(left); + const uint8x8_t above_u8 = vld1_u8(above); + const int16x4_t left_s16 = vget_low_s16(convert_u8_to_s16(left_u8)); + int16x8_t sub, sum; + uint32x2_t d; - d0u8 = vld1_dup_u8(above - 1); - d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); - q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); - for (i = 0; i < 4; i++, dst += stride) { - q1u16 = vdupq_n_u16((uint16_t)left[i]); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16)); - d0u8 = vqmovun_s16(q1s16); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - } + sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left)); + // Avoid vcombine_s16() which generates lots of redundant code with clang-3.8. + sub = vreinterpretq_s16_s64( + vdupq_lane_s64(vreinterpret_s64_s16(vget_low_s16(sub)), 0)); + + sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1)); + sum = vaddq_s16(sum, sub); + d = vreinterpret_u32_u8(vqmovun_s16(sum)); + vst1_lane_u32((uint32_t *)dst, d, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, d, 1); + dst += stride; + + sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3)); + sum = vaddq_s16(sum, sub); + d = vreinterpret_u32_u8(vqmovun_s16(sum)); + vst1_lane_u32((uint32_t *)dst, d, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, d, 1); +} + +static INLINE void tm_8_kernel(uint8_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub) { + const int16x8_t sum = vaddq_s16(left_dup, sub); + const uint8x8_t d = vqmovun_s16(sum); + vst1_u8(*dst, d); + *dst += stride; } void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j; - uint16x8_t q0u16, q3u16, q10u16; - int16x8_t q0s16; - uint16x4_t d20u16; - uint8x8_t d0u8, d2u8, d30u8; + const uint8x8_t top_left = vld1_dup_u8(above - 1); + const uint8x8_t above_u8 = vld1_u8(above); + const uint8x8_t left_u8 = vld1_u8(left); + const int16x8_t left_s16q = convert_u8_to_s16(left_u8); + const int16x8_t sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left)); + int16x4_t left_s16d = vget_low_s16(left_s16q); + int i; - d0u8 = vld1_dup_u8(above - 1); - d30u8 = vld1_u8(left); - d2u8 = vld1_u8(above); - q10u16 = vmovl_u8(d30u8); - q3u16 = vsubl_u8(d2u8, d0u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - 
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 1); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 2); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 3); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) { + int16x8_t left_dup; + + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_8_kernel(&dst, stride, left_dup, sub); + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_8_kernel(&dst, stride, left_dup, sub); + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_8_kernel(&dst, stride, left_dup, sub); + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_8_kernel(&dst, stride, left_dup, sub); } } +static INLINE void tm_16_kernel(uint8_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1) { + const int16x8_t sum0 = vaddq_s16(left_dup, sub0); + const int16x8_t sum1 = vaddq_s16(left_dup, sub1); + const uint8x8_t d0 = vqmovun_s16(sum0); + const uint8x8_t d1 = vqmovun_s16(sum1); + vst1_u8(*dst, d0); + *dst += 8; + vst1_u8(*dst, d1); + *dst += stride - 8; +} + void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j, k; - uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16; - uint8x16_t q0u8, q1u8; - int16x8_t q0s16, q1s16, q8s16, q11s16; - uint16x4_t d20u16; - uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; + const uint8x16_t top_left = vld1q_dup_u8(above - 1); + const uint8x16_t above_u8 = vld1q_u8(above); + const int16x8_t sub0 = vreinterpretq_s16_u16( + vsubl_u8(vget_low_u8(above_u8), vget_low_u8(top_left))); + const int16x8_t sub1 = vreinterpretq_s16_u16( + vsubl_u8(vget_high_u8(above_u8), vget_high_u8(top_left))); + int16x8_t left_dup; + int i; - q0u8 = vld1q_dup_u8(above - 1); - q1u8 = vld1q_u8(above); - q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - for (k = 0; k < 2; k++, left += 8) { - d18u8 = vld1_u8(left); - q10u16 = vmovl_u8(d18u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q8u16 = vdupq_lane_u16(d20u16, 1); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); - q11s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); - q8s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += stride; + for (i = 0; i < 2; i++, left += 8) { + const uint8x8_t left_u8 = vld1_u8(left); + const int16x8_t left_s16q = convert_u8_to_s16(left_u8); + 
const int16x4_t left_low = vget_low_s16(left_s16q); + const int16x4_t left_high = vget_high_s16(left_s16q); - q0u16 = vdupq_lane_u16(d20u16, 2); - q8u16 = vdupq_lane_u16(d20u16, 3); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); - q11s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); - q8s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += stride; - } + left_dup = vdupq_lane_s16(left_low, 0); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_low, 1); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_low, 2); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_low, 3); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + + left_dup = vdupq_lane_s16(left_high, 0); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_high, 1); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_high, 2); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_high, 3); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); } } +static INLINE void tm_32_kernel(uint8_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1, const int16x8_t sub2, + const int16x8_t sub3) { + const int16x8_t sum0 = vaddq_s16(left_dup, sub0); + const int16x8_t sum1 = vaddq_s16(left_dup, sub1); + const int16x8_t sum2 = vaddq_s16(left_dup, sub2); + const int16x8_t sum3 = vaddq_s16(left_dup, sub3); + const uint8x8_t d0 = vqmovun_s16(sum0); + const uint8x8_t d1 = vqmovun_s16(sum1); + const uint8x8_t d2 = vqmovun_s16(sum2); + const uint8x8_t d3 = vqmovun_s16(sum3); + + vst1q_u8(*dst, vcombine_u8(d0, d1)); + *dst += 16; + vst1q_u8(*dst, vcombine_u8(d2, d3)); + *dst += stride - 16; +} + void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j, k; - uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16; - uint8x16_t q0u8, q1u8, q2u8; - int16x8_t q12s16, q13s16, q14s16, q15s16; - uint16x4_t d6u16; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8; + const uint8x16_t top_left = vld1q_dup_u8(above - 1); + const uint8x16_t above_low = vld1q_u8(above); + const uint8x16_t above_high = vld1q_u8(above + 16); + const int16x8_t sub0 = vreinterpretq_s16_u16( + vsubl_u8(vget_low_u8(above_low), vget_low_u8(top_left))); + const int16x8_t sub1 = vreinterpretq_s16_u16( + vsubl_u8(vget_high_u8(above_low), vget_high_u8(top_left))); + const int16x8_t sub2 = vreinterpretq_s16_u16( + vsubl_u8(vget_low_u8(above_high), vget_low_u8(top_left))); + const int16x8_t sub3 = vreinterpretq_s16_u16( + vsubl_u8(vget_high_u8(above_high), vget_high_u8(top_left))); + int16x8_t left_dup; + int i, j; - q0u8 = vld1q_dup_u8(above - 1); - q1u8 = vld1q_u8(above); - q2u8 = vld1q_u8(above + 16); - q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - q10u16 = 
vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8)); - q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8)); - for (k = 0; k < 4; k++, left += 8) { - d26u8 = vld1_u8(left); - q3u16 = vmovl_u8(d26u8); - d6u16 = vget_low_u16(q3u16); - for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { - q0u16 = vdupq_lane_u16(d6u16, 0); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 1); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 2); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 3); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; + for (j = 0; j < 4; j++, left += 8) { + const uint8x8_t left_u8 = vld1_u8(left); + const int16x8_t left_s16q = convert_u8_to_s16(left_u8); + int16x4_t left_s16d = vget_low_s16(left_s16q); + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3); + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_32_kernel(&dst, stride, left_dup, sub0, 
sub1, sub2, sub3); + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3); + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3); } } } diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm b/libs/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm index 5a8fdd6aff..730c40de0e 100644 --- a/libs/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm +++ b/libs/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm @@ -8,192 +8,659 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vpx_lpf_horizontal_4_dual_neon| + EXPORT |vpx_lpf_horizontal_16_neon| + EXPORT |vpx_lpf_horizontal_16_dual_neon| + EXPORT |vpx_lpf_vertical_16_neon| + EXPORT |vpx_lpf_vertical_16_dual_neon| ARM AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, -; const uint8_t *blimit0, -; const uint8_t *limit0, -; const uint8_t *thresh0, -; const uint8_t *blimit1, -; const uint8_t *limit1, -; const uint8_t *thresh1) +; void mb_lpf_horizontal_edge(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) ; r0 uint8_t *s, -; r1 int p, -; r2 const uint8_t *blimit0, -; r3 const uint8_t *limit0, -; sp const uint8_t *thresh0, -; sp+4 const uint8_t *blimit1, -; sp+8 const uint8_t *limit1, -; sp+12 const uint8_t *thresh1, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; r12 int count +|mb_lpf_horizontal_edge| PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] ; load thresh -|vpx_lpf_horizontal_4_dual_neon| PROC - push {lr} +h_count + vld1.8 {d16[]}, [r2] ; load *blimit + vld1.8 {d17[]}, [r3] ; load *limit + vld1.8 {d18[]}, [r4] ; load *thresh - ldr r12, [sp, #4] ; load thresh0 - vld1.8 {d0}, [r2] ; load blimit0 to first half q - vld1.8 {d2}, [r3] ; load limit0 to first half q + sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines - add r1, r1, r1 ; double pitch - ldr r2, [sp, #8] ; load blimit1 + vld1.u8 {d0}, [r8@64], r1 ; p7 + vld1.u8 {d1}, [r8@64], r1 ; p6 + vld1.u8 {d2}, [r8@64], r1 ; p5 + vld1.u8 {d3}, [r8@64], r1 ; p4 + vld1.u8 {d4}, [r8@64], r1 ; p3 + vld1.u8 {d5}, [r8@64], r1 ; p2 + vld1.u8 {d6}, [r8@64], r1 ; p1 + vld1.u8 {d7}, [r8@64], r1 ; p0 + vld1.u8 {d8}, [r8@64], r1 ; q0 + vld1.u8 {d9}, [r8@64], r1 ; q1 + vld1.u8 {d10}, [r8@64], r1 ; q2 + vld1.u8 {d11}, [r8@64], r1 ; q3 + vld1.u8 {d12}, [r8@64], r1 ; q4 + vld1.u8 {d13}, [r8@64], r1 ; q5 + vld1.u8 {d14}, [r8@64], r1 ; q6 + vld1.u8 {d15}, [r8@64], r1 ; q7 - vld1.8 {d4}, [r12] ; load thresh0 to first half q + bl vpx_wide_mbfilter_neon - ldr r3, [sp, #12] ; load limit1 - ldr r12, [sp, #16] ; load thresh1 - vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q + tst r7, #1 + beq h_mbfilter - sub r2, r0, r1, lsl #1 ; s[-4 * p] + ; flat && mask were not set for any of the channels. Just store the values + ; from filter. 
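+    ; Only the 4-tap filter outputs are written back: op1 and op0 on the two
+    ; rows above the edge, oq0 and oq1 on the two rows below it.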
+ sub r8, r0, r1, lsl #1 - vld1.8 {d3}, [r3] ; load limit1 to 2nd half q - vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q + vst1.u8 {d25}, [r8@64], r1 ; store op1 + vst1.u8 {d24}, [r8@64], r1 ; store op0 + vst1.u8 {d23}, [r8@64], r1 ; store oq0 + vst1.u8 {d26}, [r8@64], r1 ; store oq1 - vpush {d8-d15} ; save neon registers + b h_next - add r3, r2, r1, lsr #1 ; s[-3 * p] +h_mbfilter + tst r7, #2 + beq h_wide_mbfilter - vld1.u8 {q3}, [r2@64], r1 ; p3 - vld1.u8 {q4}, [r3@64], r1 ; p2 - vld1.u8 {q5}, [r2@64], r1 ; p1 - vld1.u8 {q6}, [r3@64], r1 ; p0 - vld1.u8 {q7}, [r2@64], r1 ; q0 - vld1.u8 {q8}, [r3@64], r1 ; q1 - vld1.u8 {q9}, [r2@64] ; q2 - vld1.u8 {q10}, [r3@64] ; q3 + ; flat2 was not set for any of the channels. Just store the values from + ; mbfilter. + sub r8, r0, r1, lsl #1 + sub r8, r8, r1 - sub r2, r2, r1, lsl #1 - sub r3, r3, r1, lsl #1 + vst1.u8 {d18}, [r8@64], r1 ; store op2 + vst1.u8 {d19}, [r8@64], r1 ; store op1 + vst1.u8 {d20}, [r8@64], r1 ; store op0 + vst1.u8 {d21}, [r8@64], r1 ; store oq0 + vst1.u8 {d22}, [r8@64], r1 ; store oq1 + vst1.u8 {d23}, [r8@64], r1 ; store oq2 - bl vpx_loop_filter_neon_16 + b h_next - vst1.u8 {q5}, [r2@64], r1 ; store op1 - vst1.u8 {q6}, [r3@64], r1 ; store op0 - vst1.u8 {q7}, [r2@64], r1 ; store oq0 - vst1.u8 {q8}, [r3@64], r1 ; store oq1 +h_wide_mbfilter + sub r8, r0, r1, lsl #3 + add r8, r8, r1 - vpop {d8-d15} ; restore neon registers + vst1.u8 {d16}, [r8@64], r1 ; store op6 + vst1.u8 {d24}, [r8@64], r1 ; store op5 + vst1.u8 {d25}, [r8@64], r1 ; store op4 + vst1.u8 {d26}, [r8@64], r1 ; store op3 + vst1.u8 {d27}, [r8@64], r1 ; store op2 + vst1.u8 {d18}, [r8@64], r1 ; store op1 + vst1.u8 {d19}, [r8@64], r1 ; store op0 + vst1.u8 {d20}, [r8@64], r1 ; store oq0 + vst1.u8 {d21}, [r8@64], r1 ; store oq1 + vst1.u8 {d22}, [r8@64], r1 ; store oq2 + vst1.u8 {d23}, [r8@64], r1 ; store oq3 + vst1.u8 {d1}, [r8@64], r1 ; store oq4 + vst1.u8 {d2}, [r8@64], r1 ; store oq5 + vst1.u8 {d3}, [r8@64], r1 ; store oq6 - pop {pc} - ENDP ; |vpx_lpf_horizontal_4_dual_neon| +h_next + add r0, r0, #8 + subs r12, r12, #1 + bne h_count -; void vpx_loop_filter_neon_16(); + vpop {d8-d15} + pop {r4-r8, pc} + + ENDP ; |mb_lpf_horizontal_edge| + +; void vpx_lpf_horizontal_16_neon(uint8_t *s, int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_horizontal_16_neon| PROC + mov r12, #1 + b mb_lpf_horizontal_edge + ENDP ; |vpx_lpf_horizontal_16_neon| + +; void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_horizontal_16_dual_neon| PROC + mov r12, #2 + b mb_lpf_horizontal_edge + ENDP ; |vpx_lpf_horizontal_16_dual_neon| + +; void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, +; const uint8_t *limit, const uint8_t *thresh, +; int count) { +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; r12 int count +|mb_lpf_vertical_edge_w| PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] ; load thresh + +v_count + vld1.8 {d16[]}, [r2] ; load *blimit + vld1.8 {d17[]}, [r3] ; load *limit + vld1.8 {d18[]}, [r4] ; load *thresh + + sub r8, r0, #8 + + vld1.8 {d0}, [r8@64], r1 + vld1.8 {d8}, [r0@64], r1 + vld1.8 {d1}, 
[r8@64], r1 + vld1.8 {d9}, [r0@64], r1 + vld1.8 {d2}, [r8@64], r1 + vld1.8 {d10}, [r0@64], r1 + vld1.8 {d3}, [r8@64], r1 + vld1.8 {d11}, [r0@64], r1 + vld1.8 {d4}, [r8@64], r1 + vld1.8 {d12}, [r0@64], r1 + vld1.8 {d5}, [r8@64], r1 + vld1.8 {d13}, [r0@64], r1 + vld1.8 {d6}, [r8@64], r1 + vld1.8 {d14}, [r0@64], r1 + vld1.8 {d7}, [r8@64], r1 + vld1.8 {d15}, [r0@64], r1 + + sub r0, r0, r1, lsl #3 + + vtrn.32 q0, q2 + vtrn.32 q1, q3 + vtrn.32 q4, q6 + vtrn.32 q5, q7 + + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + vtrn.8 d0, d1 + vtrn.8 d2, d3 + vtrn.8 d4, d5 + vtrn.8 d6, d7 + + vtrn.8 d8, d9 + vtrn.8 d10, d11 + vtrn.8 d12, d13 + vtrn.8 d14, d15 + + bl vpx_wide_mbfilter_neon + + tst r7, #1 + beq v_mbfilter + + ; flat && mask were not set for any of the channels. Just store the values + ; from filter. + sub r0, #2 + + vswp d23, d25 + + vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r0], r1 + vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r0], r1 + vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r0], r1 + vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r0], r1 + vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r0], r1 + vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r0], r1 + vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r0], r1 + vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r0], r1 + add r0, #2 + + b v_next + +v_mbfilter + tst r7, #2 + beq v_wide_mbfilter + + ; flat2 was not set for any of the channels. Just store the values from + ; mbfilter. + sub r8, r0, #3 + + vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 + vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 + vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 + vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 + vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 + vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 + vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 + vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 + vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 + vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 + vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 + vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 + vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 + vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 + vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 + vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 + + b v_next + +v_wide_mbfilter + sub r8, r0, #8 + + vtrn.32 d0, d26 + vtrn.32 d16, d27 + vtrn.32 d24, d18 + vtrn.32 d25, d19 + + vtrn.16 d0, d24 + vtrn.16 d16, d25 + vtrn.16 d26, d18 + vtrn.16 d27, d19 + + vtrn.8 d0, d16 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + vtrn.8 d18, d19 + + vtrn.32 d20, d1 + vtrn.32 d21, d2 + vtrn.32 d22, d3 + vtrn.32 d23, d15 + + vtrn.16 d20, d22 + vtrn.16 d21, d23 + vtrn.16 d1, d3 + vtrn.16 d2, d15 + + vtrn.8 d20, d21 + vtrn.8 d22, d23 + vtrn.8 d1, d2 + vtrn.8 d3, d15 + + vst1.8 {d0}, [r8@64], r1 + vst1.8 {d20}, [r0@64], r1 + vst1.8 {d16}, [r8@64], r1 + vst1.8 {d21}, [r0@64], r1 + vst1.8 {d24}, [r8@64], r1 + vst1.8 {d22}, [r0@64], r1 + vst1.8 {d25}, [r8@64], r1 + vst1.8 {d23}, [r0@64], r1 + vst1.8 {d26}, [r8@64], r1 + vst1.8 {d1}, [r0@64], r1 + vst1.8 {d27}, [r8@64], r1 + vst1.8 {d2}, [r0@64], r1 + vst1.8 {d18}, [r8@64], r1 + vst1.8 {d3}, [r0@64], r1 + vst1.8 {d19}, [r8@64], r1 + vst1.8 {d15}, [r0@64], r1 + +v_next + subs r12, #1 + bne v_count + + vpop {d8-d15} + pop {r4-r8, pc} + + ENDP ; |mb_lpf_vertical_edge_w| + +; void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit, +; const uint8_t *limit, const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_vertical_16_neon| PROC + mov r12, #1 + b mb_lpf_vertical_edge_w + 
ENDP ; |vpx_lpf_vertical_16_neon| + +; void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_vertical_16_dual_neon| PROC + mov r12, #2 + b mb_lpf_vertical_edge_w + ENDP ; |vpx_lpf_vertical_16_dual_neon| + +; void vpx_wide_mbfilter_neon(); ; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. This function uses -; registers d8-d15, so the calling function must save those registers. +; necessary load, transpose (if necessary) and store. ; -; r0-r3, r12 PRESERVE -; q0 blimit -; q1 limit -; q2 thresh -; q3 p3 -; q4 p2 -; q5 p1 -; q6 p0 -; q7 q0 -; q8 q1 -; q9 q2 -; q10 q3 -; -; Outputs: -; q5 op1 -; q6 op0 -; q7 oq0 -; q8 oq1 -|vpx_loop_filter_neon_16| PROC +; r0-r3 PRESERVE +; d16 blimit +; d17 limit +; d18 thresh +; d0 p7 +; d1 p6 +; d2 p5 +; d3 p4 +; d4 p3 +; d5 p2 +; d6 p1 +; d7 p0 +; d8 q0 +; d9 q1 +; d10 q2 +; d11 q3 +; d12 q4 +; d13 q5 +; d14 q6 +; d15 q7 +|vpx_wide_mbfilter_neon| PROC + mov r7, #0 ; filter_mask - vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2) - vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1) - vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0) - vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0) - vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1) - vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2) + vabd.u8 d19, d4, d5 ; abs(p3 - p2) + vabd.u8 d20, d5, d6 ; abs(p2 - p1) + vabd.u8 d21, d6, d7 ; abs(p1 - p0) + vabd.u8 d22, d9, d8 ; abs(q1 - q0) + vabd.u8 d23, d10, d9 ; abs(q2 - q1) + vabd.u8 d24, d11, d10 ; abs(q3 - q2) ; only compare the largest value to limit - vmax.u8 q11, q11, q12 ; m7 = max(m1, m2) - vmax.u8 q12, q13, q14 ; m8 = max(m3, m4) + vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1)) + vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0)) + vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2)) + vmax.u8 d19, d19, d20 - vabd.u8 q9, q6, q7 ; abs(p0 - q0) + vabd.u8 d24, d7, d8 ; abs(p0 - q0) - vmax.u8 q3, q3, q4 ; m9 = max(m5, m6) + vmax.u8 d19, d19, d23 - vmov.u8 q10, #0x80 + vabd.u8 d23, d6, d9 ; a = abs(p1 - q1) + vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 - vmax.u8 q15, q11, q12 ; m10 = max(m7, m8) + ; abs () > limit + vcge.u8 d19, d17, d19 - vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 - vmax.u8 q15, q15, q3 ; m11 = max(m10, m9) + ; flatmask4 + vabd.u8 d25, d7, d5 ; abs(p0 - p2) + vabd.u8 d26, d8, d10 ; abs(q0 - q2) + vabd.u8 d27, d4, d7 ; abs(p3 - p0) + vabd.u8 d28, d11, d8 ; abs(q3 - q0) - vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) - vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 + ; only compare the largest value to thresh + vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2)) + vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0)) + vmax.u8 d25, d25, d26 + vmax.u8 d20, d20, d25 - veor q7, q7, q10 ; qs0 + vshr.u8 d23, d23, #1 ; a = a / 2 + vqadd.u8 d24, d24, d23 ; a = b + a - vcge.u8 q15, q1, q15 ; abs(m11) > limit + vmov.u8 d30, #1 + vcge.u8 d24, d16, d24 ; (a > blimit * 2 + limit) * -1 - vshr.u8 q2, q2, #1 ; a = a / 2 - veor q6, q6, q10 ; ps0 + vcge.u8 d20, d30, d20 ; flat - veor q5, q5, q10 ; ps1 - vqadd.u8 q9, q9, q2 ; a = b + a + vand d19, d19, d24 ; mask - veor q8, q8, q10 ; qs1 + ; hevmask + vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1 + vorr d21, d21, d22 ; hev - vmov.u16 q4, #3 + vand d16, 
d20, d19 ; flat && mask + vmov r5, r6, d16 - vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) - vsubl.s8 q11, d15, d13 + ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) + vabd.u8 d22, d3, d7 ; abs(p4 - p0) + vabd.u8 d23, d12, d8 ; abs(q4 - q0) + vabd.u8 d24, d7, d2 ; abs(p0 - p5) + vabd.u8 d25, d8, d13 ; abs(q0 - q5) + vabd.u8 d26, d1, d7 ; abs(p6 - p0) + vabd.u8 d27, d14, d8 ; abs(q6 - q0) + vabd.u8 d28, d0, d7 ; abs(p7 - p0) + vabd.u8 d29, d15, d8 ; abs(q7 - q0) - vcge.u8 q9, q0, q9 ; a > blimit + ; only compare the largest value to thresh + vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0)) + vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5)) + vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0)) + vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0)) - vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1) - vorr q14, q13, q14 ; hev + vmax.u8 d26, d22, d23 + vmax.u8 d27, d24, d25 + vmax.u8 d23, d26, d27 - vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) - vmul.i16 q11, q11, q4 + vcge.u8 d18, d30, d23 ; flat2 - vand q1, q1, q14 ; filter &= hev - vand q15, q15, q9 ; mask + vmov.u8 d22, #0x80 - vmov.u8 q4, #3 + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #1 ; Only do filter branch - vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0) - vaddw.s8 q11, q11, d3 + vand d17, d18, d16 ; flat2 && flat && mask + vmov r5, r6, d17 - vmov.u8 q9, #4 + ; mbfilter() function + + ; filter() function + ; convert to signed + veor d23, d8, d22 ; qs0 + veor d24, d7, d22 ; ps0 + veor d25, d6, d22 ; ps1 + veor d26, d9, d22 ; qs1 + + vmov.u8 d27, #3 + + vsub.s8 d28, d23, d24 ; ( qs0 - ps0) + vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) + vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) + vand d29, d29, d21 ; filter &= hev + vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) + vmov.u8 d29, #4 ; filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d2, q2 - vqmovn.s16 d3, q11 - vand q1, q1, q15 ; filter &= mask + vqmovn.s16 d28, q15 - vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3) - vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4) - vshr.s8 q2, q2, #3 ; filter2 >>= 3 - vshr.s8 q1, q1, #3 ; filter1 >>= 3 + vand d28, d28, d19 ; filter &= mask + + vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) + vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) + vshr.s8 d30, d30, #3 ; filter2 >>= 3 + vshr.s8 d29, d29, #3 ; filter1 >>= 3 - vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2) - vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1) + vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) + vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1) - ; outer tap adjustments - vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1 + ; outer tap adjustments: ++filter1 >> 1 + vrshr.s8 d29, d29, #1 + vbic d29, d29, d21 ; filter &= ~hev - veor q7, q0, q10 ; *oq0 = u^0x80 + vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) + vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) - vbic q1, q1, q14 ; filter &= ~hev + veor d24, d24, d22 ; *f_op0 = u^0x80 + veor d23, d23, d22 ; *f_oq0 = u^0x80 + veor d25, d25, d22 ; *f_op1 = u^0x80 + veor d26, d26, d22 ; *f_oq1 = u^0x80 - vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter) - vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter) + tst r7, #1 + bxne lr - veor q6, q11, q10 ; *op0 = u^0x80 - veor q5, q13, q10 ; *op1 = u^0x80 - veor q8, q12, q10 ; *oq1 = u^0x80 + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #2 ; Only do mbfilter branch + + ; mbfilter flat && mask branch + ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's + ; and using vibt on the q's? 
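+    ; Each flat-branch output is a rounded 8-sample average built with a
+    ; sliding sum, e.g. op2 = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3.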
+ vmov.u8 d29, #2 + vaddl.u8 q15, d7, d8 ; op2 = p0 + q0 + vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3 + vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2 + vaddl.u8 q10, d4, d5 + vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2 + vaddl.u8 q14, d6, d9 + vqrshrn.u16 d18, q15, #3 ; r_op2 + + vsub.i16 q15, q10 + vaddl.u8 q10, d4, d6 + vadd.i16 q15, q14 + vaddl.u8 q14, d7, d10 + vqrshrn.u16 d19, q15, #3 ; r_op1 + + vsub.i16 q15, q10 + vadd.i16 q15, q14 + vaddl.u8 q14, d8, d11 + vqrshrn.u16 d20, q15, #3 ; r_op0 + + vsubw.u8 q15, d4 ; oq0 = op0 - p3 + vsubw.u8 q15, d7 ; oq0 -= p0 + vadd.i16 q15, q14 + vaddl.u8 q14, d9, d11 + vqrshrn.u16 d21, q15, #3 ; r_oq0 + + vsubw.u8 q15, d5 ; oq1 = oq0 - p2 + vsubw.u8 q15, d8 ; oq1 -= q0 + vadd.i16 q15, q14 + vaddl.u8 q14, d10, d11 + vqrshrn.u16 d22, q15, #3 ; r_oq1 + + vsubw.u8 q15, d6 ; oq2 = oq0 - p1 + vsubw.u8 q15, d9 ; oq2 -= q1 + vadd.i16 q15, q14 + vqrshrn.u16 d27, q15, #3 ; r_oq2 + + ; Filter does not set op2 or oq2, so use p2 and q2. + vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask) + vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask) + vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask) + vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask) + vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask) + + vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask) + vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask) + + tst r7, #2 + bxne lr + + ; wide_mbfilter flat2 && flat && mask branch + vmov.u8 d16, #7 + vaddl.u8 q15, d7, d8 ; op6 = p0 + q0 + vaddl.u8 q12, d2, d3 + vaddl.u8 q13, d4, d5 + vaddl.u8 q14, d1, d6 + vmlal.u8 q15, d0, d16 ; op6 += p7 * 3 + vadd.i16 q12, q13 + vadd.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vadd.i16 q15, q12 + vaddl.u8 q12, d0, d1 + vaddw.u8 q15, d1 + vaddl.u8 q13, d0, d2 + vadd.i16 q14, q15, q14 + vqrshrn.u16 d16, q15, #4 ; w_op6 + + vsub.i16 q15, q14, q12 + vaddl.u8 q14, d3, d10 + vqrshrn.u16 d24, q15, #4 ; w_op5 + + vsub.i16 q15, q13 + vaddl.u8 q13, d0, d3 + vadd.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vqrshrn.u16 d25, q15, #4 ; w_op4 + + vadd.i16 q15, q14 + vaddl.u8 q14, d0, d4 + vsub.i16 q15, q13 + vsub.i16 q14, q15, q14 + vqrshrn.u16 d26, q15, #4 ; w_op3 + + vaddw.u8 q15, q14, d5 ; op2 += p2 + vaddl.u8 q14, d0, d5 + vaddw.u8 q15, d12 ; op2 += q4 + vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m) + vqrshrn.u16 d27, q15, #4 ; w_op2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d6 + vaddw.u8 q15, d6 ; op1 += p1 + vaddw.u8 q15, d13 ; op1 += q5 + vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m) + vqrshrn.u16 d18, q15, #4 ; w_op1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d7 + vaddw.u8 q15, d7 ; op0 += p0 + vaddw.u8 q15, d14 ; op0 += q6 + vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m) + vqrshrn.u16 d19, q15, #4 ; w_op0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d1, d8 + vaddw.u8 q15, d8 ; oq0 += q0 + vaddw.u8 q15, d15 ; oq0 += q7 + vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m) + vqrshrn.u16 d20, q15, #4 ; w_oq0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vaddw.u8 q15, d9 ; oq1 += q1 + vaddl.u8 q4, d10, d15 + vaddw.u8 q15, d15 ; oq1 += q7 + vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m) + vqrshrn.u16 d21, q15, #4 ; w_oq1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d3, d10 + vadd.i16 q15, q4 + vaddl.u8 q4, d11, d15 + vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m) + vqrshrn.u16 d22, q15, #4 ; w_oq2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vadd.i16 q15, q4 + vaddl.u8 q4, d12, d15 + vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m) + vqrshrn.u16 d23, q15, #4 ; w_oq3 + + vsub.i16 q15, q14 + vaddl.u8 q14, d5, d12 + vadd.i16 
q15, q4 + vaddl.u8 q4, d13, d15 + vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m) + vqrshrn.u16 d1, q15, #4 ; w_oq4 + + vsub.i16 q15, q14 + vaddl.u8 q14, d6, d13 + vadd.i16 q15, q4 + vaddl.u8 q4, d14, d15 + vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m) + vqrshrn.u16 d2, q15, #4 ; w_oq5 + + vsub.i16 q15, q14 + vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m) + vadd.i16 q15, q4 + vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m) + vqrshrn.u16 d3, q15, #4 ; w_oq6 + vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m) + vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m) + vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m) bx lr - ENDP ; |vpx_loop_filter_neon_16| + ENDP ; |vpx_wide_mbfilter_neon| END diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm b/libs/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm index 9371158984..907e918380 100644 --- a/libs/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm +++ b/libs/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm @@ -10,6 +10,8 @@ EXPORT |vpx_lpf_horizontal_4_neon| EXPORT |vpx_lpf_vertical_4_neon| + EXPORT |vpx_lpf_horizontal_4_dual_neon| + EXPORT |vpx_lpf_vertical_4_dual_neon| ARM AREA ||.text||, CODE, READONLY, ALIGN=2 @@ -53,7 +55,7 @@ sub r2, r2, r1, lsl #1 sub r3, r3, r1, lsl #1 - bl vpx_loop_filter_neon + bl filter4_8 vst1.u8 {d4}, [r2@64], r1 ; store op1 vst1.u8 {d5}, [r3@64], r1 ; store op0 @@ -113,7 +115,7 @@ vtrn.8 d7, d16 vtrn.8 d17, d18 - bl vpx_loop_filter_neon + bl filter4_8 sub r0, r0, #2 @@ -130,7 +132,7 @@ pop {pc} ENDP ; |vpx_lpf_vertical_4_neon| -; void vpx_loop_filter_neon(); +; void filter4_8(); ; This is a helper function for the loopfilters. The invidual functions do the ; necessary load, transpose (if necessary) and store. The function does not use ; registers d8-d15. @@ -154,7 +156,7 @@ ; d5 op0 ; d6 oq0 ; d7 oq1 -|vpx_loop_filter_neon| PROC +|filter4_8| PROC ; filter_mask vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) @@ -244,6 +246,304 @@ veor d7, d20, d18 ; *oq1 = u^0x80 bx lr - ENDP ; |vpx_loop_filter_neon| + ENDP ; |filter4_8| + +;void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, +; const uint8_t *blimit0, +; const uint8_t *limit0, +; const uint8_t *thresh0, +; const uint8_t *blimit1, +; const uint8_t *limit1, +; const uint8_t *thresh1) +; r0 uint8_t *s, +; r1 int p, +; r2 const uint8_t *blimit0, +; r3 const uint8_t *limit0, +; sp const uint8_t *thresh0, +; sp+4 const uint8_t *blimit1, +; sp+8 const uint8_t *limit1, +; sp+12 const uint8_t *thresh1, + +|vpx_lpf_horizontal_4_dual_neon| PROC + push {lr} + + ldr r12, [sp, #4] ; load thresh0 + vld1.8 {d0}, [r2] ; load blimit0 to first half q + vld1.8 {d2}, [r3] ; load limit0 to first half q + + add r1, r1, r1 ; double pitch + ldr r2, [sp, #8] ; load blimit1 + + vld1.8 {d4}, [r12] ; load thresh0 to first half q + + ldr r3, [sp, #12] ; load limit1 + ldr r12, [sp, #16] ; load thresh1 + vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q + + sub r2, r0, r1, lsl #1 ; s[-4 * p] + + vld1.8 {d3}, [r3] ; load limit1 to 2nd half q + vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q + + vpush {d8-d15} ; save neon registers + + add r3, r2, r1, lsr #1 ; s[-3 * p] + + vld1.u8 {q3}, [r2@64], r1 ; p3 + vld1.u8 {q4}, [r3@64], r1 ; p2 + vld1.u8 {q5}, [r2@64], r1 ; p1 + vld1.u8 {q6}, [r3@64], r1 ; p0 + vld1.u8 {q7}, [r2@64], r1 ; q0 + vld1.u8 {q8}, [r3@64], r1 ; q1 + vld1.u8 {q9}, [r2@64] ; q2 + vld1.u8 {q10}, [r3@64] ; q3 + + sub r2, r2, r1, lsl #1 + sub r3, r3, r1, lsl #1 + + bl filter4_16 + + vst1.u8 {q5}, [r2@64], r1 ; store op1 + vst1.u8 {q6}, [r3@64], r1 ; store op0 + vst1.u8 {q7}, [r2@64], r1 ; 
store oq0 + vst1.u8 {q8}, [r3@64], r1 ; store oq1 + + vpop {d8-d15} ; restore neon registers + + pop {pc} + ENDP ; |vpx_lpf_horizontal_4_dual_neon| + +;void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, +; const uint8_t *blimit0, +; const uint8_t *limit0, +; const uint8_t *thresh0, +; const uint8_t *blimit1, +; const uint8_t *limit1, +; const uint8_t *thresh1) +; r0 uint8_t *s, +; r1 int p, +; r2 const uint8_t *blimit0, +; r3 const uint8_t *limit0, +; sp const uint8_t *thresh0, +; sp+4 const uint8_t *blimit1, +; sp+8 const uint8_t *limit1, +; sp+12 const uint8_t *thresh1, + +|vpx_lpf_vertical_4_dual_neon| PROC + push {lr} + + ldr r12, [sp, #4] ; load thresh0 + vld1.8 {d0}, [r2] ; load blimit0 to first half q + vld1.8 {d2}, [r3] ; load limit0 to first half q + + ldr r2, [sp, #8] ; load blimit1 + + vld1.8 {d4}, [r12] ; load thresh0 to first half q + + ldr r3, [sp, #12] ; load limit1 + ldr r12, [sp, #16] ; load thresh1 + vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q + + sub r2, r0, #4 ; s[-4] + + vld1.8 {d3}, [r3] ; load limit1 to 2nd half q + vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q + + vpush {d8-d15} ; save neon registers + + vld1.u8 {d6}, [r2], r1 ; 00 01 02 03 04 05 06 07 + vld1.u8 {d8}, [r2], r1 ; 10 11 12 13 14 15 16 17 + vld1.u8 {d10}, [r2], r1 ; 20 21 22 23 24 25 26 27 + vld1.u8 {d12}, [r2], r1 ; 30 31 32 33 34 35 36 37 + vld1.u8 {d14}, [r2], r1 ; 40 41 42 43 44 45 46 47 + vld1.u8 {d16}, [r2], r1 ; 50 51 52 53 54 55 56 57 + vld1.u8 {d18}, [r2], r1 ; 60 61 62 63 64 65 66 67 + vld1.u8 {d20}, [r2], r1 ; 70 71 72 73 74 75 76 77 + vld1.u8 {d7}, [r2], r1 ; 80 81 82 83 84 85 86 87 + vld1.u8 {d9}, [r2], r1 ; 90 91 92 93 94 95 96 97 + vld1.u8 {d11}, [r2], r1 ; A0 A1 A2 A3 A4 A5 A6 A7 + vld1.u8 {d13}, [r2], r1 ; B0 B1 B2 B3 B4 B5 B6 B7 + vld1.u8 {d15}, [r2], r1 ; C0 C1 C2 C3 C4 C5 C6 C7 + vld1.u8 {d17}, [r2], r1 ; D0 D1 D2 D3 D4 D5 D6 D7 + vld1.u8 {d19}, [r2], r1 ; E0 E1 E2 E3 E4 E5 E6 E7 + vld1.u8 {d21}, [r2] ; F0 F1 F2 F3 F4 F5 F6 F7 + + vtrn.8 q3, q4 ; q3 : 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96 + ; q4 : 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97 + vtrn.8 q5, q6 ; q5 : 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6 + ; q6 : 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7 + vtrn.8 q7, q8 ; q7 : 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6 + ; q8 : 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7 + vtrn.8 q9, q10 ; q9 : 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6 + ; q10: 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7 + + vtrn.16 q3, q5 ; q3 : 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4 + ; q5 : 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6 + vtrn.16 q4, q6 ; q4 : 01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5 + ; q6 : 03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7 + vtrn.16 q7, q9 ; q7 : 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4 + ; q9 : 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6 + vtrn.16 q8, q10 ; q8 : 41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5 + ; q10: 43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7 + + vtrn.32 q3, q7 ; q3 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0 + ; q7 : 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4 + vtrn.32 q5, q9 ; q5 : 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2 + ; q9 : 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6 + vtrn.32 q4, q8 ; q4 : 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1 + ; q8 : 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5 + vtrn.32 q6, q10 ; q6 : 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3 + ; q10: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7 + + bl filter4_16 + + sub 
r0, #2 + + vmov d0, d11 + vmov d1, d13 + vmov d2, d15 + vmov d3, d17 + vmov d11, d12 + vmov d12, d14 + vmov d13, d16 + vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 + vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 + vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 + vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 + vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1 + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 + vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1 + vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1 + vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1 + vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r0], r1 + vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r0], r1 + vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r0], r1 + vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r0] + + vpop {d8-d15} ; restore neon registers + + pop {pc} + ENDP ; |vpx_lpf_vertical_4_dual_neon| + +; void filter4_16(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. This function uses +; registers d8-d15, so the calling function must save those registers. +; +; r0-r3, r12 PRESERVE +; q0 blimit +; q1 limit +; q2 thresh +; q3 p3 +; q4 p2 +; q5 p1 +; q6 p0 +; q7 q0 +; q8 q1 +; q9 q2 +; q10 q3 +; +; Outputs: +; q5 op1 +; q6 op0 +; q7 oq0 +; q8 oq1 +|filter4_16| PROC + + ; filter_mask + vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2) + vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1) + vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0) + vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0) + vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1) + vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 q11, q11, q12 ; m7 = max(m1, m2) + vmax.u8 q12, q13, q14 ; m8 = max(m3, m4) + + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + + vmax.u8 q3, q3, q4 ; m9 = max(m5, m6) + + vmov.u8 q10, #0x80 + + vmax.u8 q15, q11, q12 ; m10 = max(m7, m8) + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + vmax.u8 q15, q15, q3 ; m11 = max(m10, m9) + + vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) + vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 + + veor q7, q7, q10 ; qs0 + + vcge.u8 q15, q1, q15 ; abs(m11) > limit + + vshr.u8 q2, q2, #1 ; a = a / 2 + veor q6, q6, q10 ; ps0 + + veor q5, q5, q10 ; ps1 + vqadd.u8 q9, q9, q2 ; a = b + a + + veor q8, q8, q10 ; qs1 + + vmov.u16 q4, #3 + + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q11, d15, d13 + + vcge.u8 q9, q0, q9 ; a > blimit + + vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1) + vorr q14, q13, q14 ; hev + + vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) + vmul.i16 q11, q11, q4 + + vand q1, q1, q14 ; filter &= hev + vand q15, q15, q9 ; mask + + vmov.u8 q4, #3 + + vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0) + vaddw.s8 q11, q11, d3 + + vmov.u8 q9, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d2, q2 + vqmovn.s16 d3, q11 + vand q1, q1, q15 ; filter &= mask + + vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3) + vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4) + vshr.s8 q2, q2, #3 ; filter2 >>= 3 + vshr.s8 q1, q1, #3 ; filter1 >>= 3 + + + vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2) + vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1) + + ; outer tap adjustments + vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1 + + veor q7, q0, q10 ; *oq0 = u^0x80 + + vbic q1, q1, q14 ; filter &= ~hev + + vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter) + vqsub.s8 q12, q8, q1 ; u = 
clamp(qs1 - filter) + + veor q6, q11, q10 ; *op0 = u^0x80 + veor q5, q13, q10 ; *op1 = u^0x80 + veor q8, q12, q10 ; *oq1 = u^0x80 + + bx lr + ENDP ; |filter4_16| END diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_mb_neon.asm b/libs/libvpx/vpx_dsp/arm/loopfilter_mb_neon.asm deleted file mode 100644 index 5279ecfb7f..0000000000 --- a/libs/libvpx/vpx_dsp/arm/loopfilter_mb_neon.asm +++ /dev/null @@ -1,666 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vpx_lpf_horizontal_edge_8_neon| - EXPORT |vpx_lpf_horizontal_edge_16_neon| - EXPORT |vpx_lpf_vertical_16_neon| - EXPORT |vpx_lpf_vertical_16_dual_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; void mb_lpf_horizontal_edge(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -; r12 int count -|mb_lpf_horizontal_edge| PROC - push {r4-r8, lr} - vpush {d8-d15} - ldr r4, [sp, #88] ; load thresh - -h_count - vld1.8 {d16[]}, [r2] ; load *blimit - vld1.8 {d17[]}, [r3] ; load *limit - vld1.8 {d18[]}, [r4] ; load *thresh - - sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines - - vld1.u8 {d0}, [r8@64], r1 ; p7 - vld1.u8 {d1}, [r8@64], r1 ; p6 - vld1.u8 {d2}, [r8@64], r1 ; p5 - vld1.u8 {d3}, [r8@64], r1 ; p4 - vld1.u8 {d4}, [r8@64], r1 ; p3 - vld1.u8 {d5}, [r8@64], r1 ; p2 - vld1.u8 {d6}, [r8@64], r1 ; p1 - vld1.u8 {d7}, [r8@64], r1 ; p0 - vld1.u8 {d8}, [r8@64], r1 ; q0 - vld1.u8 {d9}, [r8@64], r1 ; q1 - vld1.u8 {d10}, [r8@64], r1 ; q2 - vld1.u8 {d11}, [r8@64], r1 ; q3 - vld1.u8 {d12}, [r8@64], r1 ; q4 - vld1.u8 {d13}, [r8@64], r1 ; q5 - vld1.u8 {d14}, [r8@64], r1 ; q6 - vld1.u8 {d15}, [r8@64], r1 ; q7 - - bl vpx_wide_mbfilter_neon - - tst r7, #1 - beq h_mbfilter - - ; flat && mask were not set for any of the channels. Just store the values - ; from filter. - sub r8, r0, r1, lsl #1 - - vst1.u8 {d25}, [r8@64], r1 ; store op1 - vst1.u8 {d24}, [r8@64], r1 ; store op0 - vst1.u8 {d23}, [r8@64], r1 ; store oq0 - vst1.u8 {d26}, [r8@64], r1 ; store oq1 - - b h_next - -h_mbfilter - tst r7, #2 - beq h_wide_mbfilter - - ; flat2 was not set for any of the channels. Just store the values from - ; mbfilter. 
- sub r8, r0, r1, lsl #1 - sub r8, r8, r1 - - vst1.u8 {d18}, [r8@64], r1 ; store op2 - vst1.u8 {d19}, [r8@64], r1 ; store op1 - vst1.u8 {d20}, [r8@64], r1 ; store op0 - vst1.u8 {d21}, [r8@64], r1 ; store oq0 - vst1.u8 {d22}, [r8@64], r1 ; store oq1 - vst1.u8 {d23}, [r8@64], r1 ; store oq2 - - b h_next - -h_wide_mbfilter - sub r8, r0, r1, lsl #3 - add r8, r8, r1 - - vst1.u8 {d16}, [r8@64], r1 ; store op6 - vst1.u8 {d24}, [r8@64], r1 ; store op5 - vst1.u8 {d25}, [r8@64], r1 ; store op4 - vst1.u8 {d26}, [r8@64], r1 ; store op3 - vst1.u8 {d27}, [r8@64], r1 ; store op2 - vst1.u8 {d18}, [r8@64], r1 ; store op1 - vst1.u8 {d19}, [r8@64], r1 ; store op0 - vst1.u8 {d20}, [r8@64], r1 ; store oq0 - vst1.u8 {d21}, [r8@64], r1 ; store oq1 - vst1.u8 {d22}, [r8@64], r1 ; store oq2 - vst1.u8 {d23}, [r8@64], r1 ; store oq3 - vst1.u8 {d1}, [r8@64], r1 ; store oq4 - vst1.u8 {d2}, [r8@64], r1 ; store oq5 - vst1.u8 {d3}, [r8@64], r1 ; store oq6 - -h_next - add r0, r0, #8 - subs r12, r12, #1 - bne h_count - - vpop {d8-d15} - pop {r4-r8, pc} - - ENDP ; |mb_lpf_horizontal_edge| - -; void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int pitch, -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh -|vpx_lpf_horizontal_edge_8_neon| PROC - mov r12, #1 - b mb_lpf_horizontal_edge - ENDP ; |vpx_lpf_horizontal_edge_8_neon| - -; void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int pitch, -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh -|vpx_lpf_horizontal_edge_16_neon| PROC - mov r12, #2 - b mb_lpf_horizontal_edge - ENDP ; |vpx_lpf_horizontal_edge_16_neon| - -; void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, -; const uint8_t *limit, const uint8_t *thresh, -; int count) { -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -; r12 int count -|mb_lpf_vertical_edge_w| PROC - push {r4-r8, lr} - vpush {d8-d15} - ldr r4, [sp, #88] ; load thresh - -v_count - vld1.8 {d16[]}, [r2] ; load *blimit - vld1.8 {d17[]}, [r3] ; load *limit - vld1.8 {d18[]}, [r4] ; load *thresh - - sub r8, r0, #8 - - vld1.8 {d0}, [r8@64], r1 - vld1.8 {d8}, [r0@64], r1 - vld1.8 {d1}, [r8@64], r1 - vld1.8 {d9}, [r0@64], r1 - vld1.8 {d2}, [r8@64], r1 - vld1.8 {d10}, [r0@64], r1 - vld1.8 {d3}, [r8@64], r1 - vld1.8 {d11}, [r0@64], r1 - vld1.8 {d4}, [r8@64], r1 - vld1.8 {d12}, [r0@64], r1 - vld1.8 {d5}, [r8@64], r1 - vld1.8 {d13}, [r0@64], r1 - vld1.8 {d6}, [r8@64], r1 - vld1.8 {d14}, [r0@64], r1 - vld1.8 {d7}, [r8@64], r1 - vld1.8 {d15}, [r0@64], r1 - - sub r0, r0, r1, lsl #3 - - vtrn.32 q0, q2 - vtrn.32 q1, q3 - vtrn.32 q4, q6 - vtrn.32 q5, q7 - - vtrn.16 q0, q1 - vtrn.16 q2, q3 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - vtrn.8 d0, d1 - vtrn.8 d2, d3 - vtrn.8 d4, d5 - vtrn.8 d6, d7 - - vtrn.8 d8, d9 - vtrn.8 d10, d11 - vtrn.8 d12, d13 - vtrn.8 d14, d15 - - bl vpx_wide_mbfilter_neon - - tst r7, #1 - beq v_mbfilter - - ; flat && mask were not set for any of the channels. Just store the values - ; from filter. 
- sub r0, #2 - - vswp d23, d25 - - vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r0], r1 - vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r0], r1 - vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r0], r1 - vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r0], r1 - vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r0], r1 - vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r0], r1 - vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r0], r1 - vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r0], r1 - add r0, #2 - - b v_next - -v_mbfilter - tst r7, #2 - beq v_wide_mbfilter - - ; flat2 was not set for any of the channels. Just store the values from - ; mbfilter. - sub r8, r0, #3 - - vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 - vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 - vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 - vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 - vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 - vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 - vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 - vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 - vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 - vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 - vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 - vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 - vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 - vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 - vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 - vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 - - b v_next - -v_wide_mbfilter - sub r8, r0, #8 - - vtrn.32 d0, d26 - vtrn.32 d16, d27 - vtrn.32 d24, d18 - vtrn.32 d25, d19 - - vtrn.16 d0, d24 - vtrn.16 d16, d25 - vtrn.16 d26, d18 - vtrn.16 d27, d19 - - vtrn.8 d0, d16 - vtrn.8 d24, d25 - vtrn.8 d26, d27 - vtrn.8 d18, d19 - - vtrn.32 d20, d1 - vtrn.32 d21, d2 - vtrn.32 d22, d3 - vtrn.32 d23, d15 - - vtrn.16 d20, d22 - vtrn.16 d21, d23 - vtrn.16 d1, d3 - vtrn.16 d2, d15 - - vtrn.8 d20, d21 - vtrn.8 d22, d23 - vtrn.8 d1, d2 - vtrn.8 d3, d15 - - vst1.8 {d0}, [r8@64], r1 - vst1.8 {d20}, [r0@64], r1 - vst1.8 {d16}, [r8@64], r1 - vst1.8 {d21}, [r0@64], r1 - vst1.8 {d24}, [r8@64], r1 - vst1.8 {d22}, [r0@64], r1 - vst1.8 {d25}, [r8@64], r1 - vst1.8 {d23}, [r0@64], r1 - vst1.8 {d26}, [r8@64], r1 - vst1.8 {d1}, [r0@64], r1 - vst1.8 {d27}, [r8@64], r1 - vst1.8 {d2}, [r0@64], r1 - vst1.8 {d18}, [r8@64], r1 - vst1.8 {d3}, [r0@64], r1 - vst1.8 {d19}, [r8@64], r1 - vst1.8 {d15}, [r0@64], r1 - -v_next - subs r12, #1 - bne v_count - - vpop {d8-d15} - pop {r4-r8, pc} - - ENDP ; |mb_lpf_vertical_edge_w| - -; void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit, -; const uint8_t *limit, const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh -|vpx_lpf_vertical_16_neon| PROC - mov r12, #1 - b mb_lpf_vertical_edge_w - ENDP ; |vpx_lpf_vertical_16_neon| - -; void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh -|vpx_lpf_vertical_16_dual_neon| PROC - mov r12, #2 - b mb_lpf_vertical_edge_w - ENDP ; |vpx_lpf_vertical_16_dual_neon| - -; void vpx_wide_mbfilter_neon(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. 
-; -; r0-r3 PRESERVE -; d16 blimit -; d17 limit -; d18 thresh -; d0 p7 -; d1 p6 -; d2 p5 -; d3 p4 -; d4 p3 -; d5 p2 -; d6 p1 -; d7 p0 -; d8 q0 -; d9 q1 -; d10 q2 -; d11 q3 -; d12 q4 -; d13 q5 -; d14 q6 -; d15 q7 -|vpx_wide_mbfilter_neon| PROC - mov r7, #0 - - ; filter_mask - vabd.u8 d19, d4, d5 ; abs(p3 - p2) - vabd.u8 d20, d5, d6 ; abs(p2 - p1) - vabd.u8 d21, d6, d7 ; abs(p1 - p0) - vabd.u8 d22, d9, d8 ; abs(q1 - q0) - vabd.u8 d23, d10, d9 ; abs(q2 - q1) - vabd.u8 d24, d11, d10 ; abs(q3 - q2) - - ; only compare the largest value to limit - vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1)) - vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0)) - vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2)) - vmax.u8 d19, d19, d20 - - vabd.u8 d24, d7, d8 ; abs(p0 - q0) - - vmax.u8 d19, d19, d23 - - vabd.u8 d23, d6, d9 ; a = abs(p1 - q1) - vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 - - ; abs () > limit - vcge.u8 d19, d17, d19 - - ; flatmask4 - vabd.u8 d25, d7, d5 ; abs(p0 - p2) - vabd.u8 d26, d8, d10 ; abs(q0 - q2) - vabd.u8 d27, d4, d7 ; abs(p3 - p0) - vabd.u8 d28, d11, d8 ; abs(q3 - q0) - - ; only compare the largest value to thresh - vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2)) - vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0)) - vmax.u8 d25, d25, d26 - vmax.u8 d20, d20, d25 - - vshr.u8 d23, d23, #1 ; a = a / 2 - vqadd.u8 d24, d24, d23 ; a = b + a - - vmov.u8 d30, #1 - vcge.u8 d24, d16, d24 ; (a > blimit * 2 + limit) * -1 - - vcge.u8 d20, d30, d20 ; flat - - vand d19, d19, d24 ; mask - - ; hevmask - vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1 - vorr d21, d21, d22 ; hev - - vand d16, d20, d19 ; flat && mask - vmov r5, r6, d16 - - ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) - vabd.u8 d22, d3, d7 ; abs(p4 - p0) - vabd.u8 d23, d12, d8 ; abs(q4 - q0) - vabd.u8 d24, d7, d2 ; abs(p0 - p5) - vabd.u8 d25, d8, d13 ; abs(q0 - q5) - vabd.u8 d26, d1, d7 ; abs(p6 - p0) - vabd.u8 d27, d14, d8 ; abs(q6 - q0) - vabd.u8 d28, d0, d7 ; abs(p7 - p0) - vabd.u8 d29, d15, d8 ; abs(q7 - q0) - - ; only compare the largest value to thresh - vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0)) - vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5)) - vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0)) - vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0)) - - vmax.u8 d26, d22, d23 - vmax.u8 d27, d24, d25 - vmax.u8 d23, d26, d27 - - vcge.u8 d18, d30, d23 ; flat2 - - vmov.u8 d22, #0x80 - - orrs r5, r5, r6 ; Check for 0 - orreq r7, r7, #1 ; Only do filter branch - - vand d17, d18, d16 ; flat2 && flat && mask - vmov r5, r6, d17 - - ; mbfilter() function - - ; filter() function - ; convert to signed - veor d23, d8, d22 ; qs0 - veor d24, d7, d22 ; ps0 - veor d25, d6, d22 ; ps1 - veor d26, d9, d22 ; qs1 - - vmov.u8 d27, #3 - - vsub.s8 d28, d23, d24 ; ( qs0 - ps0) - vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) - vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) - vand d29, d29, d21 ; filter &= hev - vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) - vmov.u8 d29, #4 - - ; filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d28, q15 - - vand d28, d28, d19 ; filter &= mask - - vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) - vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) - vshr.s8 d30, d30, #3 ; filter2 >>= 3 - vshr.s8 d29, d29, #3 ; filter1 >>= 3 - - - vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) - vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1) - - ; outer tap adjustments: ++filter1 >> 1 - vrshr.s8 
d29, d29, #1 - vbic d29, d29, d21 ; filter &= ~hev - - vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) - vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) - - veor d24, d24, d22 ; *f_op0 = u^0x80 - veor d23, d23, d22 ; *f_oq0 = u^0x80 - veor d25, d25, d22 ; *f_op1 = u^0x80 - veor d26, d26, d22 ; *f_oq1 = u^0x80 - - tst r7, #1 - bxne lr - - orrs r5, r5, r6 ; Check for 0 - orreq r7, r7, #2 ; Only do mbfilter branch - - ; mbfilter flat && mask branch - ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's - ; and using vibt on the q's? - vmov.u8 d29, #2 - vaddl.u8 q15, d7, d8 ; op2 = p0 + q0 - vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3 - vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2 - vaddl.u8 q10, d4, d5 - vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2 - vaddl.u8 q14, d6, d9 - vqrshrn.u16 d18, q15, #3 ; r_op2 - - vsub.i16 q15, q10 - vaddl.u8 q10, d4, d6 - vadd.i16 q15, q14 - vaddl.u8 q14, d7, d10 - vqrshrn.u16 d19, q15, #3 ; r_op1 - - vsub.i16 q15, q10 - vadd.i16 q15, q14 - vaddl.u8 q14, d8, d11 - vqrshrn.u16 d20, q15, #3 ; r_op0 - - vsubw.u8 q15, d4 ; oq0 = op0 - p3 - vsubw.u8 q15, d7 ; oq0 -= p0 - vadd.i16 q15, q14 - vaddl.u8 q14, d9, d11 - vqrshrn.u16 d21, q15, #3 ; r_oq0 - - vsubw.u8 q15, d5 ; oq1 = oq0 - p2 - vsubw.u8 q15, d8 ; oq1 -= q0 - vadd.i16 q15, q14 - vaddl.u8 q14, d10, d11 - vqrshrn.u16 d22, q15, #3 ; r_oq1 - - vsubw.u8 q15, d6 ; oq2 = oq0 - p1 - vsubw.u8 q15, d9 ; oq2 -= q1 - vadd.i16 q15, q14 - vqrshrn.u16 d27, q15, #3 ; r_oq2 - - ; Filter does not set op2 or oq2, so use p2 and q2. - vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask) - vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask) - vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask) - vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask) - vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask) - - vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask) - vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask) - - tst r7, #2 - bxne lr - - ; wide_mbfilter flat2 && flat && mask branch - vmov.u8 d16, #7 - vaddl.u8 q15, d7, d8 ; op6 = p0 + q0 - vaddl.u8 q12, d2, d3 - vaddl.u8 q13, d4, d5 - vaddl.u8 q14, d1, d6 - vmlal.u8 q15, d0, d16 ; op6 += p7 * 3 - vadd.i16 q12, q13 - vadd.i16 q15, q14 - vaddl.u8 q14, d2, d9 - vadd.i16 q15, q12 - vaddl.u8 q12, d0, d1 - vaddw.u8 q15, d1 - vaddl.u8 q13, d0, d2 - vadd.i16 q14, q15, q14 - vqrshrn.u16 d16, q15, #4 ; w_op6 - - vsub.i16 q15, q14, q12 - vaddl.u8 q14, d3, d10 - vqrshrn.u16 d24, q15, #4 ; w_op5 - - vsub.i16 q15, q13 - vaddl.u8 q13, d0, d3 - vadd.i16 q15, q14 - vaddl.u8 q14, d4, d11 - vqrshrn.u16 d25, q15, #4 ; w_op4 - - vadd.i16 q15, q14 - vaddl.u8 q14, d0, d4 - vsub.i16 q15, q13 - vsub.i16 q14, q15, q14 - vqrshrn.u16 d26, q15, #4 ; w_op3 - - vaddw.u8 q15, q14, d5 ; op2 += p2 - vaddl.u8 q14, d0, d5 - vaddw.u8 q15, d12 ; op2 += q4 - vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m) - vqrshrn.u16 d27, q15, #4 ; w_op2 - - vsub.i16 q15, q14 - vaddl.u8 q14, d0, d6 - vaddw.u8 q15, d6 ; op1 += p1 - vaddw.u8 q15, d13 ; op1 += q5 - vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m) - vqrshrn.u16 d18, q15, #4 ; w_op1 - - vsub.i16 q15, q14 - vaddl.u8 q14, d0, d7 - vaddw.u8 q15, d7 ; op0 += p0 - vaddw.u8 q15, d14 ; op0 += q6 - vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m) - vqrshrn.u16 d19, q15, #4 ; w_op0 - - vsub.i16 q15, q14 - vaddl.u8 q14, d1, d8 - vaddw.u8 q15, d8 ; oq0 += q0 - vaddw.u8 q15, d15 ; oq0 += q7 - vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m) - vqrshrn.u16 d20, q15, #4 ; w_oq0 - - vsub.i16 q15, q14 - vaddl.u8 q14, d2, d9 - vaddw.u8 q15, d9 ; 
oq1 += q1 - vaddl.u8 q4, d10, d15 - vaddw.u8 q15, d15 ; oq1 += q7 - vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m) - vqrshrn.u16 d21, q15, #4 ; w_oq1 - - vsub.i16 q15, q14 - vaddl.u8 q14, d3, d10 - vadd.i16 q15, q4 - vaddl.u8 q4, d11, d15 - vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m) - vqrshrn.u16 d22, q15, #4 ; w_oq2 - - vsub.i16 q15, q14 - vaddl.u8 q14, d4, d11 - vadd.i16 q15, q4 - vaddl.u8 q4, d12, d15 - vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m) - vqrshrn.u16 d23, q15, #4 ; w_oq3 - - vsub.i16 q15, q14 - vaddl.u8 q14, d5, d12 - vadd.i16 q15, q4 - vaddl.u8 q4, d13, d15 - vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m) - vqrshrn.u16 d1, q15, #4 ; w_oq4 - - vsub.i16 q15, q14 - vaddl.u8 q14, d6, d13 - vadd.i16 q15, q4 - vaddl.u8 q4, d14, d15 - vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m) - vqrshrn.u16 d2, q15, #4 ; w_oq5 - - vsub.i16 q15, q14 - vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m) - vadd.i16 q15, q4 - vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m) - vqrshrn.u16 d3, q15, #4 ; w_oq6 - vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m) - vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m) - vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m) - - bx lr - ENDP ; |vpx_wide_mbfilter_neon| - - END diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_mb_neon.c b/libs/libvpx/vpx_dsp/arm/loopfilter_mb_neon.c deleted file mode 100644 index f95267472c..0000000000 --- a/libs/libvpx/vpx_dsp/arm/loopfilter_mb_neon.c +++ /dev/null @@ -1,1093 +0,0 @@ -/* - * Copyright (c) 2016 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/arm/transpose_neon.h" - -// For all the static inline functions, the functions ending with '_8' process -// 8 samples in a bunch, and the functions ending with '_16' process 16 samples -// in a bunch. - -#define FUN_LOAD_THRESH(w, r) \ - static INLINE void load_thresh_##w( \ - const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, \ - uint8x##w##_t *blimit_vec, uint8x##w##_t *limit_vec, \ - uint8x##w##_t *thresh_vec) { \ - *blimit_vec = vld1##r##dup_u8(blimit); \ - *limit_vec = vld1##r##dup_u8(limit); \ - *thresh_vec = vld1##r##dup_u8(thresh); \ - } - -FUN_LOAD_THRESH(8, _) // load_thresh_8 -FUN_LOAD_THRESH(16, q_) // load_thresh_16 -#undef FUN_LOAD_THRESH - -static INLINE void load_thresh_8_dual( - const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, - uint8x16_t *blimit_vec, uint8x16_t *limit_vec, uint8x16_t *thresh_vec) { - *blimit_vec = vcombine_u8(vld1_dup_u8(blimit0), vld1_dup_u8(blimit1)); - *limit_vec = vcombine_u8(vld1_dup_u8(limit0), vld1_dup_u8(limit1)); - *thresh_vec = vcombine_u8(vld1_dup_u8(thresh0), vld1_dup_u8(thresh1)); -} - -// Here flat is 64-bit long, with each 8-bit (or 4-bit) chunk being a mask of a -// pixel. When used to control filter branches, we only detect whether it is all -// 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status. -// flat equals 0 if and only if flat_status equals 0. -// flat equals -1 (all 1s) if and only if flat_status equals -2. (This is true -// because each mask occupies more than 1 bit.) 
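(Editor's illustration, not part of the patch: a minimal scalar sketch of the flat_status trick described in the comment above, assuming every byte of the mask is either 0x00 or 0xFF. calc_flat_status_8() below computes the same value with a single vpaddl_u32.)

#include <stdint.h>

/* Scalar model of calc_flat_status_8(). */
static uint32_t flat_status_8_scalar_model(const uint8_t flat[8]) {
  uint32_t lo = 0, hi = 0;
  int i;
  for (i = 0; i < 4; ++i) {
    lo |= (uint32_t)flat[i] << (8 * i);     /* low 32-bit lane */
    hi |= (uint32_t)flat[4 + i] << (8 * i); /* high 32-bit lane */
  }
  /* vpaddl_u32 adds the two 32-bit lanes; the low 32 bits of that sum are
   * 0 when every byte is 0x00, and 0xFFFFFFFF + 0xFFFFFFFF = (uint32_t)-2
   * when every byte is 0xFF, so a single scalar compare can decide a whole
   * filter branch. */
  return lo + hi;
}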
-static INLINE uint32_t calc_flat_status_8(uint8x8_t flat) { - return vget_lane_u32( - vreinterpret_u32_u64(vpaddl_u32(vreinterpret_u32_u8(flat))), 0); -} - -// Here flat is 128-bit long, with each 8-bit chunk being a mask of a pixel. -// When used to control filter branches, we only detect whether it is all 0s or -// all 1s. We narrowing shift right each 16-bit chunk by 4 arithmetically, so -// we get a 64-bit long number, with each 4-bit chunk being a mask of a pixel. -// Then we pairwise add flat to a 32-bit long number flat_status. -// flat equals 0 if and only if flat_status equals 0. -// flat equals -1 (all 1s) if and only if flat_status equals -2. (This is true -// because each mask occupies more than 1 bit.) -static INLINE uint32_t calc_flat_status_16(uint8x16_t flat) { - const uint8x8_t flat_4bit = - vreinterpret_u8_s8(vshrn_n_s16(vreinterpretq_s16_u8(flat), 4)); - return calc_flat_status_8(flat_4bit); -} - -#define FUN_FILTER_HEV_MASK4(w, r) \ - static INLINE uint8x##w##_t filter_hev_mask4_##w( \ - const uint8x##w##_t limit, const uint8x##w##_t blimit, \ - const uint8x##w##_t thresh, const uint8x##w##_t p3, \ - const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \ - const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \ - const uint8x##w##_t q3, uint8x##w##_t *hev, uint8x##w##_t *mask) { \ - uint8x##w##_t max, t0, t1; \ - \ - max = vabd##r##u8(p1, p0); \ - max = vmax##r##u8(max, vabd##r##u8(q1, q0)); \ - *hev = vcgt##r##u8(max, thresh); \ - *mask = vmax##r##u8(max, vabd##r##u8(p3, p2)); \ - *mask = vmax##r##u8(*mask, vabd##r##u8(p2, p1)); \ - *mask = vmax##r##u8(*mask, vabd##r##u8(q2, q1)); \ - *mask = vmax##r##u8(*mask, vabd##r##u8(q3, q2)); \ - t0 = vabd##r##u8(p0, q0); \ - t1 = vabd##r##u8(p1, q1); \ - t0 = vqadd##r##u8(t0, t0); \ - t1 = vshr##r##n_u8(t1, 1); \ - t0 = vqadd##r##u8(t0, t1); \ - *mask = vcle##r##u8(*mask, limit); \ - t0 = vcle##r##u8(t0, blimit); \ - *mask = vand##r##u8(*mask, t0); \ - \ - return max; \ - } - -FUN_FILTER_HEV_MASK4(8, _) // filter_hev_mask4_8 -FUN_FILTER_HEV_MASK4(16, q_) // filter_hev_mask4_16 -#undef FUN_FILTER_HEV_MASK4 - -#define FUN_FILTER_FLAT_HEV_MASK(w, r) \ - static INLINE uint8x##w##_t filter_flat_hev_mask_##w( \ - const uint8x##w##_t limit, const uint8x##w##_t blimit, \ - const uint8x##w##_t thresh, const uint8x##w##_t p3, \ - const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \ - const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \ - const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status, \ - uint8x##w##_t *hev) { \ - uint8x##w##_t max, mask; \ - \ - max = filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, \ - q2, q3, hev, &mask); \ - *flat = vmax##r##u8(max, vabd##r##u8(p2, p0)); \ - *flat = vmax##r##u8(*flat, vabd##r##u8(q2, q0)); \ - *flat = vmax##r##u8(*flat, vabd##r##u8(p3, p0)); \ - *flat = vmax##r##u8(*flat, vabd##r##u8(q3, q0)); \ - *flat = vcle##r##u8(*flat, vdup##r##n_u8(1)); /* flat_mask4() */ \ - *flat = vand##r##u8(*flat, mask); \ - *flat_status = calc_flat_status_##w(*flat); \ - \ - return mask; \ - } - -FUN_FILTER_FLAT_HEV_MASK(8, _) // filter_flat_hev_mask_8 -FUN_FILTER_FLAT_HEV_MASK(16, q_) // filter_flat_hev_mask_16 -#undef FUN_FILTER_FLAT_HEV_MASK - -#define FUN_FLAT_MASK5(w, r) \ - static INLINE uint8x##w##_t flat_mask5_##w( \ - const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ - const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ - const uint8x##w##_t q1, const 
uint8x##w##_t q2, const uint8x##w##_t q3, \ - const uint8x##w##_t q4, const uint8x##w##_t flat, \ - uint32_t *flat2_status) { \ - uint8x##w##_t flat2 = vabd##r##u8(p4, p0); \ - flat2 = vmax##r##u8(flat2, vabd##r##u8(p3, p0)); \ - flat2 = vmax##r##u8(flat2, vabd##r##u8(p2, p0)); \ - flat2 = vmax##r##u8(flat2, vabd##r##u8(p1, p0)); \ - flat2 = vmax##r##u8(flat2, vabd##r##u8(q1, q0)); \ - flat2 = vmax##r##u8(flat2, vabd##r##u8(q2, q0)); \ - flat2 = vmax##r##u8(flat2, vabd##r##u8(q3, q0)); \ - flat2 = vmax##r##u8(flat2, vabd##r##u8(q4, q0)); \ - flat2 = vcle##r##u8(flat2, vdup##r##n_u8(1)); \ - flat2 = vand##r##u8(flat2, flat); \ - *flat2_status = calc_flat_status_##w(flat2); \ - \ - return flat2; \ - } - -FUN_FLAT_MASK5(8, _) // flat_mask5_8 -FUN_FLAT_MASK5(16, q_) // flat_mask5_16 -#undef FUN_FLAT_MASK5 - -#define FUN_FLIP_SIGN(w, r) \ - static INLINE int8x##w##_t flip_sign_##w(const uint8x##w##_t v) { \ - const uint8x##w##_t sign_bit = vdup##r##n_u8(0x80); \ - return vreinterpret##r##s8_u8(veor##r##u8(v, sign_bit)); \ - } - -FUN_FLIP_SIGN(8, _) // flip_sign_8 -FUN_FLIP_SIGN(16, q_) // flip_sign_16 -#undef FUN_FLIP_SIGN - -#define FUN_FLIP_SIGN_BACK(w, r) \ - static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \ - const int8x##w##_t sign_bit = vdup##r##n_s8(0x80); \ - return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit)); \ - } - -FUN_FLIP_SIGN_BACK(8, _) // flip_sign_back_8 -FUN_FLIP_SIGN_BACK(16, q_) // flip_sign_back_16 -#undef FUN_FLIP_SIGN_BACK - -static INLINE void filter_update_8(const uint8x8_t sub0, const uint8x8_t sub1, - const uint8x8_t add0, const uint8x8_t add1, - uint16x8_t *sum) { - *sum = vsubw_u8(*sum, sub0); - *sum = vsubw_u8(*sum, sub1); - *sum = vaddw_u8(*sum, add0); - *sum = vaddw_u8(*sum, add1); -} - -static INLINE void filter_update_16(const uint8x16_t sub0, - const uint8x16_t sub1, - const uint8x16_t add0, - const uint8x16_t add1, uint16x8_t *sum0, - uint16x8_t *sum1) { - *sum0 = vsubw_u8(*sum0, vget_low_u8(sub0)); - *sum1 = vsubw_u8(*sum1, vget_high_u8(sub0)); - *sum0 = vsubw_u8(*sum0, vget_low_u8(sub1)); - *sum1 = vsubw_u8(*sum1, vget_high_u8(sub1)); - *sum0 = vaddw_u8(*sum0, vget_low_u8(add0)); - *sum1 = vaddw_u8(*sum1, vget_high_u8(add0)); - *sum0 = vaddw_u8(*sum0, vget_low_u8(add1)); - *sum1 = vaddw_u8(*sum1, vget_high_u8(add1)); -} - -static INLINE uint8x8_t calc_7_tap_filter_8_kernel(const uint8x8_t sub0, - const uint8x8_t sub1, - const uint8x8_t add0, - const uint8x8_t add1, - uint16x8_t *sum) { - filter_update_8(sub0, sub1, add0, add1, sum); - return vrshrn_n_u16(*sum, 3); -} - -static INLINE uint8x16_t calc_7_tap_filter_16_kernel( - const uint8x16_t sub0, const uint8x16_t sub1, const uint8x16_t add0, - const uint8x16_t add1, uint16x8_t *sum0, uint16x8_t *sum1) { - filter_update_16(sub0, sub1, add0, add1, sum0, sum1); - return vcombine_u8(vrshrn_n_u16(*sum0, 3), vrshrn_n_u16(*sum1, 3)); -} - -static INLINE uint8x8_t apply_15_tap_filter_8_kernel( - const uint8x8_t flat, const uint8x8_t sub0, const uint8x8_t sub1, - const uint8x8_t add0, const uint8x8_t add1, const uint8x8_t in, - uint16x8_t *sum) { - filter_update_8(sub0, sub1, add0, add1, sum); - return vbsl_u8(flat, vrshrn_n_u16(*sum, 4), in); -} - -static INLINE uint8x16_t apply_15_tap_filter_16_kernel( - const uint8x16_t flat, const uint8x16_t sub0, const uint8x16_t sub1, - const uint8x16_t add0, const uint8x16_t add1, const uint8x16_t in, - uint16x8_t *sum0, uint16x8_t *sum1) { - uint8x16_t t; - filter_update_16(sub0, sub1, add0, add1, sum0, sum1); - t = 
vcombine_u8(vrshrn_n_u16(*sum0, 4), vrshrn_n_u16(*sum1, 4)); - return vbslq_u8(flat, t, in); -} - -// 7-tap filter [1, 1, 1, 2, 1, 1, 1] -static INLINE void calc_7_tap_filter_8(const uint8x8_t p3, const uint8x8_t p2, - const uint8x8_t p1, const uint8x8_t p0, - const uint8x8_t q0, const uint8x8_t q1, - const uint8x8_t q2, const uint8x8_t q3, - uint8x8_t *op2, uint8x8_t *op1, - uint8x8_t *op0, uint8x8_t *oq0, - uint8x8_t *oq1, uint8x8_t *oq2) { - uint16x8_t sum; - sum = vaddl_u8(p3, p3); // 2*p3 - sum = vaddw_u8(sum, p3); // 3*p3 - sum = vaddw_u8(sum, p2); // 3*p3+p2 - sum = vaddw_u8(sum, p2); // 3*p3+2*p2 - sum = vaddw_u8(sum, p1); // 3*p3+2*p2+p1 - sum = vaddw_u8(sum, p0); // 3*p3+2*p2+p1+p0 - sum = vaddw_u8(sum, q0); // 3*p3+2*p2+p1+p0+q0 - *op2 = vrshrn_n_u16(sum, 3); - *op1 = calc_7_tap_filter_8_kernel(p3, p2, p1, q1, &sum); - *op0 = calc_7_tap_filter_8_kernel(p3, p1, p0, q2, &sum); - *oq0 = calc_7_tap_filter_8_kernel(p3, p0, q0, q3, &sum); - *oq1 = calc_7_tap_filter_8_kernel(p2, q0, q1, q3, &sum); - *oq2 = calc_7_tap_filter_8_kernel(p1, q1, q2, q3, &sum); -} - -static INLINE void calc_7_tap_filter_16( - const uint8x16_t p3, const uint8x16_t p2, const uint8x16_t p1, - const uint8x16_t p0, const uint8x16_t q0, const uint8x16_t q1, - const uint8x16_t q2, const uint8x16_t q3, uint8x16_t *op2, uint8x16_t *op1, - uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2) { - uint16x8_t sum0, sum1; - sum0 = vaddl_u8(vget_low_u8(p3), vget_low_u8(p3)); // 2*p3 - sum1 = vaddl_u8(vget_high_u8(p3), vget_high_u8(p3)); // 2*p3 - sum0 = vaddw_u8(sum0, vget_low_u8(p3)); // 3*p3 - sum1 = vaddw_u8(sum1, vget_high_u8(p3)); // 3*p3 - sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 3*p3+p2 - sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 3*p3+p2 - sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 3*p3+2*p2 - sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 3*p3+2*p2 - sum0 = vaddw_u8(sum0, vget_low_u8(p1)); // 3*p3+2*p2+p1 - sum1 = vaddw_u8(sum1, vget_high_u8(p1)); // 3*p3+2*p2+p1 - sum0 = vaddw_u8(sum0, vget_low_u8(p0)); // 3*p3+2*p2+p1+p0 - sum1 = vaddw_u8(sum1, vget_high_u8(p0)); // 3*p3+2*p2+p1+p0 - sum0 = vaddw_u8(sum0, vget_low_u8(q0)); // 3*p3+2*p2+p1+p0+q0 - sum1 = vaddw_u8(sum1, vget_high_u8(q0)); // 3*p3+2*p2+p1+p0+q0 - *op2 = vcombine_u8(vrshrn_n_u16(sum0, 3), vrshrn_n_u16(sum1, 3)); - *op1 = calc_7_tap_filter_16_kernel(p3, p2, p1, q1, &sum0, &sum1); - *op0 = calc_7_tap_filter_16_kernel(p3, p1, p0, q2, &sum0, &sum1); - *oq0 = calc_7_tap_filter_16_kernel(p3, p0, q0, q3, &sum0, &sum1); - *oq1 = calc_7_tap_filter_16_kernel(p2, q0, q1, q3, &sum0, &sum1); - *oq2 = calc_7_tap_filter_16_kernel(p1, q1, q2, q3, &sum0, &sum1); -} - -#define FUN_APPLY_7_TAP_FILTER(w, r) \ - static INLINE void apply_7_tap_filter_##w( \ - const uint8x##w##_t flat, const uint8x##w##_t p3, \ - const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \ - const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \ - const uint8x##w##_t q3, uint8x##w##_t *op2, uint8x##w##_t *op1, \ - uint8x##w##_t *op0, uint8x##w##_t *oq0, uint8x##w##_t *oq1, \ - uint8x##w##_t *oq2) { \ - uint8x##w##_t tp1, tp0, tq0, tq1; \ - calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0, \ - &tq0, &tq1, oq2); \ - *op2 = vbsl##r##u8(flat, *op2, p2); \ - *op1 = vbsl##r##u8(flat, tp1, *op1); \ - *op0 = vbsl##r##u8(flat, tp0, *op0); \ - *oq0 = vbsl##r##u8(flat, tq0, *oq0); \ - *oq1 = vbsl##r##u8(flat, tq1, *oq1); \ - *oq2 = vbsl##r##u8(flat, *oq2, q2); \ - } - -FUN_APPLY_7_TAP_FILTER(8, _) // apply_7_tap_filter_8 
-FUN_APPLY_7_TAP_FILTER(16, q_) // apply_7_tap_filter_16 -#undef FUN_APPLY_7_TAP_FILTER - -// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] -static INLINE void apply_15_tap_filter_8( - const uint8x8_t flat2, const uint8x8_t p7, const uint8x8_t p6, - const uint8x8_t p5, const uint8x8_t p4, const uint8x8_t p3, - const uint8x8_t p2, const uint8x8_t p1, const uint8x8_t p0, - const uint8x8_t q0, const uint8x8_t q1, const uint8x8_t q2, - const uint8x8_t q3, const uint8x8_t q4, const uint8x8_t q5, - const uint8x8_t q6, const uint8x8_t q7, uint8x8_t *op6, uint8x8_t *op5, - uint8x8_t *op4, uint8x8_t *op3, uint8x8_t *op2, uint8x8_t *op1, - uint8x8_t *op0, uint8x8_t *oq0, uint8x8_t *oq1, uint8x8_t *oq2, - uint8x8_t *oq3, uint8x8_t *oq4, uint8x8_t *oq5, uint8x8_t *oq6) { - uint16x8_t sum; - sum = vshll_n_u8(p7, 3); // 8*p7 - sum = vsubw_u8(sum, p7); // 7*p7 - sum = vaddw_u8(sum, p6); // 7*p7+p6 - sum = vaddw_u8(sum, p6); // 7*p7+2*p6 - sum = vaddw_u8(sum, p5); // 7*p7+2*p6+p5 - sum = vaddw_u8(sum, p4); // 7*p7+2*p6+p5+p4 - sum = vaddw_u8(sum, p3); // 7*p7+2*p6+p5+p4+p3 - sum = vaddw_u8(sum, p2); // 7*p7+2*p6+p5+p4+p3+p2 - sum = vaddw_u8(sum, p1); // 7*p7+2*p6+p5+p4+p3+p2+p1 - sum = vaddw_u8(sum, p0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 - sum = vaddw_u8(sum, q0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 - *op6 = vbsl_u8(flat2, vrshrn_n_u16(sum, 4), p6); - *op5 = apply_15_tap_filter_8_kernel(flat2, p7, p6, p5, q1, p5, &sum); - *op4 = apply_15_tap_filter_8_kernel(flat2, p7, p5, p4, q2, p4, &sum); - *op3 = apply_15_tap_filter_8_kernel(flat2, p7, p4, p3, q3, p3, &sum); - *op2 = apply_15_tap_filter_8_kernel(flat2, p7, p3, p2, q4, *op2, &sum); - *op1 = apply_15_tap_filter_8_kernel(flat2, p7, p2, p1, q5, *op1, &sum); - *op0 = apply_15_tap_filter_8_kernel(flat2, p7, p1, p0, q6, *op0, &sum); - *oq0 = apply_15_tap_filter_8_kernel(flat2, p7, p0, q0, q7, *oq0, &sum); - *oq1 = apply_15_tap_filter_8_kernel(flat2, p6, q0, q1, q7, *oq1, &sum); - *oq2 = apply_15_tap_filter_8_kernel(flat2, p5, q1, q2, q7, *oq2, &sum); - *oq3 = apply_15_tap_filter_8_kernel(flat2, p4, q2, q3, q7, q3, &sum); - *oq4 = apply_15_tap_filter_8_kernel(flat2, p3, q3, q4, q7, q4, &sum); - *oq5 = apply_15_tap_filter_8_kernel(flat2, p2, q4, q5, q7, q5, &sum); - *oq6 = apply_15_tap_filter_8_kernel(flat2, p1, q5, q6, q7, q6, &sum); -} - -static INLINE void apply_15_tap_filter_16( - const uint8x16_t flat2, const uint8x16_t p7, const uint8x16_t p6, - const uint8x16_t p5, const uint8x16_t p4, const uint8x16_t p3, - const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0, - const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2, - const uint8x16_t q3, const uint8x16_t q4, const uint8x16_t q5, - const uint8x16_t q6, const uint8x16_t q7, uint8x16_t *op6, uint8x16_t *op5, - uint8x16_t *op4, uint8x16_t *op3, uint8x16_t *op2, uint8x16_t *op1, - uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2, - uint8x16_t *oq3, uint8x16_t *oq4, uint8x16_t *oq5, uint8x16_t *oq6) { - uint16x8_t sum0, sum1; - uint8x16_t t; - sum0 = vshll_n_u8(vget_low_u8(p7), 3); // 8*p7 - sum1 = vshll_n_u8(vget_high_u8(p7), 3); // 8*p7 - sum0 = vsubw_u8(sum0, vget_low_u8(p7)); // 7*p7 - sum1 = vsubw_u8(sum1, vget_high_u8(p7)); // 7*p7 - sum0 = vaddw_u8(sum0, vget_low_u8(p6)); // 7*p7+p6 - sum1 = vaddw_u8(sum1, vget_high_u8(p6)); // 7*p7+p6 - sum0 = vaddw_u8(sum0, vget_low_u8(p6)); // 7*p7+2*p6 - sum1 = vaddw_u8(sum1, vget_high_u8(p6)); // 7*p7+2*p6 - sum0 = vaddw_u8(sum0, vget_low_u8(p5)); // 7*p7+2*p6+p5 - sum1 = vaddw_u8(sum1, vget_high_u8(p5)); // 
7*p7+2*p6+p5 - sum0 = vaddw_u8(sum0, vget_low_u8(p4)); // 7*p7+2*p6+p5+p4 - sum1 = vaddw_u8(sum1, vget_high_u8(p4)); // 7*p7+2*p6+p5+p4 - sum0 = vaddw_u8(sum0, vget_low_u8(p3)); // 7*p7+2*p6+p5+p4+p3 - sum1 = vaddw_u8(sum1, vget_high_u8(p3)); // 7*p7+2*p6+p5+p4+p3 - sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 7*p7+2*p6+p5+p4+p3+p2 - sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 7*p7+2*p6+p5+p4+p3+p2 - sum0 = vaddw_u8(sum0, vget_low_u8(p1)); // 7*p7+2*p6+p5+p4+p3+p2+p1 - sum1 = vaddw_u8(sum1, vget_high_u8(p1)); // 7*p7+2*p6+p5+p4+p3+p2+p1 - sum0 = vaddw_u8(sum0, vget_low_u8(p0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 - sum1 = vaddw_u8(sum1, vget_high_u8(p0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 - sum0 = vaddw_u8(sum0, vget_low_u8(q0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 - sum1 = vaddw_u8(sum1, vget_high_u8(q0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 - t = vcombine_u8(vrshrn_n_u16(sum0, 4), vrshrn_n_u16(sum1, 4)); - *op6 = vbslq_u8(flat2, t, p6); - *op5 = apply_15_tap_filter_16_kernel(flat2, p7, p6, p5, q1, p5, &sum0, &sum1); - *op4 = apply_15_tap_filter_16_kernel(flat2, p7, p5, p4, q2, p4, &sum0, &sum1); - *op3 = apply_15_tap_filter_16_kernel(flat2, p7, p4, p3, q3, p3, &sum0, &sum1); - *op2 = - apply_15_tap_filter_16_kernel(flat2, p7, p3, p2, q4, *op2, &sum0, &sum1); - *op1 = - apply_15_tap_filter_16_kernel(flat2, p7, p2, p1, q5, *op1, &sum0, &sum1); - *op0 = - apply_15_tap_filter_16_kernel(flat2, p7, p1, p0, q6, *op0, &sum0, &sum1); - *oq0 = - apply_15_tap_filter_16_kernel(flat2, p7, p0, q0, q7, *oq0, &sum0, &sum1); - *oq1 = - apply_15_tap_filter_16_kernel(flat2, p6, q0, q1, q7, *oq1, &sum0, &sum1); - *oq2 = - apply_15_tap_filter_16_kernel(flat2, p5, q1, q2, q7, *oq2, &sum0, &sum1); - *oq3 = apply_15_tap_filter_16_kernel(flat2, p4, q2, q3, q7, q3, &sum0, &sum1); - *oq4 = apply_15_tap_filter_16_kernel(flat2, p3, q3, q4, q7, q4, &sum0, &sum1); - *oq5 = apply_15_tap_filter_16_kernel(flat2, p2, q4, q5, q7, q5, &sum0, &sum1); - *oq6 = apply_15_tap_filter_16_kernel(flat2, p1, q5, q6, q7, q6, &sum0, &sum1); -} - -#define FUN_FILTER4(w, r) \ - static INLINE void filter4_##w( \ - const uint8x##w##_t mask, const uint8x##w##_t hev, \ - const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ - const uint8x##w##_t q1, uint8x##w##_t *op1, uint8x##w##_t *op0, \ - uint8x##w##_t *oq0, uint8x##w##_t *oq1) { \ - int8x##w##_t filter, filter1, filter2, t; \ - int8x##w##_t ps1 = flip_sign_##w(p1); \ - int8x##w##_t ps0 = flip_sign_##w(p0); \ - int8x##w##_t qs0 = flip_sign_##w(q0); \ - int8x##w##_t qs1 = flip_sign_##w(q1); \ - \ - /* add outer taps if we have high edge variance */ \ - filter = vqsub##r##s8(ps1, qs1); \ - filter = vand##r##s8(filter, vreinterpret##r##s8_u8(hev)); \ - t = vqsub##r##s8(qs0, ps0); \ - \ - /* inner taps */ \ - filter = vqadd##r##s8(filter, t); \ - filter = vqadd##r##s8(filter, t); \ - filter = vqadd##r##s8(filter, t); \ - filter = vand##r##s8(filter, vreinterpret##r##s8_u8(mask)); \ - \ - /* save bottom 3 bits so that we round one side +4 and the other +3 */ \ - /* if it equals 4 we'll set to adjust by -1 to account for the fact */ \ - /* we'd round 3 the other way */ \ - filter1 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(4)), 3); \ - filter2 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(3)), 3); \ - \ - qs0 = vqsub##r##s8(qs0, filter1); \ - ps0 = vqadd##r##s8(ps0, filter2); \ - *oq0 = flip_sign_back_##w(qs0); \ - *op0 = flip_sign_back_##w(ps0); \ - \ - /* outer tap adjustments */ \ - filter = vrshr##r##n_s8(filter1, 1); \ - filter = vbic##r##s8(filter, 
vreinterpret##r##s8_u8(hev)); \ - \ - qs1 = vqsub##r##s8(qs1, filter); \ - ps1 = vqadd##r##s8(ps1, filter); \ - *oq1 = flip_sign_back_##w(qs1); \ - *op1 = flip_sign_back_##w(ps1); \ - } - -FUN_FILTER4(8, _) // filter4_8 -FUN_FILTER4(16, q_) // filter4_16 -#undef FUN_FILTER4 - -#define FUN_FILTER8(w) \ - static INLINE void filter8_##w( \ - const uint8x##w##_t mask, const uint8x##w##_t flat, \ - const uint32_t flat_status, const uint8x##w##_t hev, \ - const uint8x##w##_t p3, const uint8x##w##_t p2, const uint8x##w##_t p1, \ - const uint8x##w##_t p0, const uint8x##w##_t q0, const uint8x##w##_t q1, \ - const uint8x##w##_t q2, const uint8x##w##_t q3, uint8x##w##_t *op2, \ - uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \ - uint8x##w##_t *oq1, uint8x##w##_t *oq2) { \ - if (flat_status != (uint32_t)-2) { \ - filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1); \ - *op2 = p2; \ - *oq2 = q2; \ - if (flat_status) { \ - apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \ - op0, oq0, oq1, oq2); \ - } \ - } else { \ - calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, \ - oq0, oq1, oq2); \ - } \ - } - -FUN_FILTER8(8) // filter8_8 -FUN_FILTER8(16) // filter8_16 -#undef FUN_FILTER8 - -#define FUN_FILTER16(w) \ - static INLINE void filter16_##w( \ - const uint8x##w##_t mask, const uint8x##w##_t flat, \ - const uint32_t flat_status, const uint8x##w##_t flat2, \ - const uint32_t flat2_status, const uint8x##w##_t hev, \ - const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5, \ - const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ - const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ - const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ - const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \ - const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5, \ - uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2, \ - uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \ - uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3, \ - uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6) { \ - if (flat_status != (uint32_t)-2) { \ - filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1); \ - } \ - \ - if (flat_status) { \ - *op2 = p2; \ - *oq2 = q2; \ - if (flat2_status != (uint32_t)-2) { \ - apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \ - op0, oq0, oq1, oq2); \ - } \ - if (flat2_status) { \ - apply_15_tap_filter_##w(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, \ - q2, q3, q4, q5, q6, q7, op6, op5, op4, op3, \ - op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, \ - oq6); \ - } \ - } \ - } - -FUN_FILTER16(8) // filter16_8 -FUN_FILTER16(16) // filter16_16 -#undef FUN_FILTER16 - -#define FUN_LOAD8(w, r) \ - static INLINE void load_##w##x8( \ - const uint8_t *s, const int p, uint8x##w##_t *p3, uint8x##w##_t *p2, \ - uint8x##w##_t *p1, uint8x##w##_t *p0, uint8x##w##_t *q0, \ - uint8x##w##_t *q1, uint8x##w##_t *q2, uint8x##w##_t *q3) { \ - *p3 = vld1##r##u8(s); \ - s += p; \ - *p2 = vld1##r##u8(s); \ - s += p; \ - *p1 = vld1##r##u8(s); \ - s += p; \ - *p0 = vld1##r##u8(s); \ - s += p; \ - *q0 = vld1##r##u8(s); \ - s += p; \ - *q1 = vld1##r##u8(s); \ - s += p; \ - *q2 = vld1##r##u8(s); \ - s += p; \ - *q3 = vld1##r##u8(s); \ - } - -FUN_LOAD8(8, _) // load_8x8 -FUN_LOAD8(16, q_) // load_16x8 -#undef FUN_LOAD8 - -#define FUN_LOAD16(w, r) \ - static INLINE void load_##w##x16( \ - const uint8_t *s, const int p, 
uint8x##w##_t *s0, uint8x##w##_t *s1, \ - uint8x##w##_t *s2, uint8x##w##_t *s3, uint8x##w##_t *s4, \ - uint8x##w##_t *s5, uint8x##w##_t *s6, uint8x##w##_t *s7, \ - uint8x##w##_t *s8, uint8x##w##_t *s9, uint8x##w##_t *s10, \ - uint8x##w##_t *s11, uint8x##w##_t *s12, uint8x##w##_t *s13, \ - uint8x##w##_t *s14, uint8x##w##_t *s15) { \ - *s0 = vld1##r##u8(s); \ - s += p; \ - *s1 = vld1##r##u8(s); \ - s += p; \ - *s2 = vld1##r##u8(s); \ - s += p; \ - *s3 = vld1##r##u8(s); \ - s += p; \ - *s4 = vld1##r##u8(s); \ - s += p; \ - *s5 = vld1##r##u8(s); \ - s += p; \ - *s6 = vld1##r##u8(s); \ - s += p; \ - *s7 = vld1##r##u8(s); \ - s += p; \ - *s8 = vld1##r##u8(s); \ - s += p; \ - *s9 = vld1##r##u8(s); \ - s += p; \ - *s10 = vld1##r##u8(s); \ - s += p; \ - *s11 = vld1##r##u8(s); \ - s += p; \ - *s12 = vld1##r##u8(s); \ - s += p; \ - *s13 = vld1##r##u8(s); \ - s += p; \ - *s14 = vld1##r##u8(s); \ - s += p; \ - *s15 = vld1##r##u8(s); \ - } - -FUN_LOAD16(8, _) // load_8x16 -FUN_LOAD16(16, q_) // load_16x16 -#undef FUN_LOAD16 - -#define FUN_STORE4(w, r) \ - static INLINE void store_##w##x4( \ - uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \ - const uint8x##w##_t s2, const uint8x##w##_t s3) { \ - vst1##r##u8(s, s0); \ - s += p; \ - vst1##r##u8(s, s1); \ - s += p; \ - vst1##r##u8(s, s2); \ - s += p; \ - vst1##r##u8(s, s3); \ - } - -FUN_STORE4(8, _) // store_8x4 -FUN_STORE4(16, q_) // store_16x4 -#undef FUN_STORE4 - -#define FUN_STORE6(w, r) \ - static INLINE void store_##w##x6( \ - uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \ - const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \ - const uint8x##w##_t s5) { \ - vst1##r##u8(s, s0); \ - s += p; \ - vst1##r##u8(s, s1); \ - s += p; \ - vst1##r##u8(s, s2); \ - s += p; \ - vst1##r##u8(s, s3); \ - s += p; \ - vst1##r##u8(s, s4); \ - s += p; \ - vst1##r##u8(s, s5); \ - } - -FUN_STORE6(8, _) // store_8x6 -FUN_STORE6(16, q_) // store_16x6 -#undef FUN_STORE6 - -static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1, - const uint8x8_t p0, const uint8x8_t q0, - const uint8x8_t q1) { - uint8x8x4_t o; - - o.val[0] = p1; - o.val[1] = p0; - o.val[2] = q0; - o.val[3] = q1; - vst4_lane_u8(s, o, 0); - s += p; - vst4_lane_u8(s, o, 1); - s += p; - vst4_lane_u8(s, o, 2); - s += p; - vst4_lane_u8(s, o, 3); - s += p; - vst4_lane_u8(s, o, 4); - s += p; - vst4_lane_u8(s, o, 5); - s += p; - vst4_lane_u8(s, o, 6); - s += p; - vst4_lane_u8(s, o, 7); -} - -static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0, - const uint8x8_t s1, const uint8x8_t s2, - const uint8x8_t s3, const uint8x8_t s4, - const uint8x8_t s5) { - uint8x8x3_t o0, o1; - - o0.val[0] = s0; - o0.val[1] = s1; - o0.val[2] = s2; - o1.val[0] = s3; - o1.val[1] = s4; - o1.val[2] = s5; - vst3_lane_u8(s - 3, o0, 0); - vst3_lane_u8(s + 0, o1, 0); - s += p; - vst3_lane_u8(s - 3, o0, 1); - vst3_lane_u8(s + 0, o1, 1); - s += p; - vst3_lane_u8(s - 3, o0, 2); - vst3_lane_u8(s + 0, o1, 2); - s += p; - vst3_lane_u8(s - 3, o0, 3); - vst3_lane_u8(s + 0, o1, 3); - s += p; - vst3_lane_u8(s - 3, o0, 4); - vst3_lane_u8(s + 0, o1, 4); - s += p; - vst3_lane_u8(s - 3, o0, 5); - vst3_lane_u8(s + 0, o1, 5); - s += p; - vst3_lane_u8(s - 3, o0, 6); - vst3_lane_u8(s + 0, o1, 6); - s += p; - vst3_lane_u8(s - 3, o0, 7); - vst3_lane_u8(s + 0, o1, 7); -} - -#define FUN_STORE8(w, r) \ - static INLINE void store_##w##x8( \ - uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \ - const uint8x##w##_t s2, const 
uint8x##w##_t s3, const uint8x##w##_t s4, \ - const uint8x##w##_t s5, const uint8x##w##_t s6, \ - const uint8x##w##_t s7) { \ - vst1##r##u8(s, s0); \ - s += p; \ - vst1##r##u8(s, s1); \ - s += p; \ - vst1##r##u8(s, s2); \ - s += p; \ - vst1##r##u8(s, s3); \ - s += p; \ - vst1##r##u8(s, s4); \ - s += p; \ - vst1##r##u8(s, s5); \ - s += p; \ - vst1##r##u8(s, s6); \ - s += p; \ - vst1##r##u8(s, s7); \ - } - -FUN_STORE8(8, _) // store_8x8 -FUN_STORE8(16, q_) // store_16x8 -#undef FUN_STORE8 - -#define FUN_STORE14(w, r) \ - static INLINE void store_##w##x14( \ - uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \ - const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ - const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ - const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ - const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \ - const uint32_t flat_status, const uint32_t flat2_status) { \ - if (flat_status) { \ - if (flat2_status) { \ - vst1##r##u8(s - 7 * p, p6); \ - vst1##r##u8(s - 6 * p, p5); \ - vst1##r##u8(s - 5 * p, p4); \ - vst1##r##u8(s - 4 * p, p3); \ - vst1##r##u8(s + 3 * p, q3); \ - vst1##r##u8(s + 4 * p, q4); \ - vst1##r##u8(s + 5 * p, q5); \ - vst1##r##u8(s + 6 * p, q6); \ - } \ - vst1##r##u8(s - 3 * p, p2); \ - vst1##r##u8(s + 2 * p, q2); \ - } \ - vst1##r##u8(s - 2 * p, p1); \ - vst1##r##u8(s - 1 * p, p0); \ - vst1##r##u8(s + 0 * p, q0); \ - vst1##r##u8(s + 1 * p, q1); \ - } - -FUN_STORE14(8, _) // store_8x14 -FUN_STORE14(16, q_) // store_16x14 -#undef FUN_STORE14 - -static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0, - const uint8x16_t s1, const uint8x16_t s2, - const uint8x16_t s3, const uint8x16_t s4, - const uint8x16_t s5, const uint8x16_t s6, - const uint8x16_t s7, const uint8x16_t s8, - const uint8x16_t s9, const uint8x16_t s10, - const uint8x16_t s11, const uint8x16_t s12, - const uint8x16_t s13, const uint8x16_t s14, - const uint8x16_t s15) { - vst1q_u8(s, s0); - s += p; - vst1q_u8(s, s1); - s += p; - vst1q_u8(s, s2); - s += p; - vst1q_u8(s, s3); - s += p; - vst1q_u8(s, s4); - s += p; - vst1q_u8(s, s5); - s += p; - vst1q_u8(s, s6); - s += p; - vst1q_u8(s, s7); - s += p; - vst1q_u8(s, s8); - s += p; - vst1q_u8(s, s9); - s += p; - vst1q_u8(s, s10); - s += p; - vst1q_u8(s, s11); - s += p; - vst1q_u8(s, s12); - s += p; - vst1q_u8(s, s13); - s += p; - vst1q_u8(s, s14); - s += p; - vst1q_u8(s, s15); -} - -#define FUN_HOR_4_KERNEL(name, w) \ - static INLINE void lpf_horizontal_4##name##kernel( \ - uint8_t *s, const int p, const uint8x##w##_t blimit, \ - const uint8x##w##_t limit, const uint8x##w##_t thresh) { \ - uint8x##w##_t p3, p2, p1, p0, q0, q1, q2, q3, mask, hev; \ - \ - load_##w##x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); \ - filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, q2, \ - q3, &hev, &mask); \ - filter4_##w(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); \ - store_##w##x4(s - 2 * p, p, p1, p0, q0, q1); \ - } - -FUN_HOR_4_KERNEL(_, 8) // lpf_horizontal_4_kernel -FUN_HOR_4_KERNEL(_dual_, 16) // lpf_horizontal_4_dual_kernel -#undef FUN_HOR_4_KERNEL - -void vpx_lpf_horizontal_4_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint8x8_t blimit_vec, limit_vec, thresh_vec; - load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); - lpf_horizontal_4_kernel(s, p, blimit_vec, limit_vec, thresh_vec); -} - -void 
vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - uint8x16_t blimit_vec, limit_vec, thresh_vec; - load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, - &blimit_vec, &limit_vec, &thresh_vec); - lpf_horizontal_4_dual_kernel(s, p, blimit_vec, limit_vec, thresh_vec); -} - -void vpx_lpf_vertical_4_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, - mask, hev; - load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); - load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); - transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); - filter_hev_mask4_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, - q2, q3, &hev, &mask); - filter4_8(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); - store_4x8(s - 2, p, p1, p0, q0, q1); -} - -void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, - mask, hev; - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, - s15; - - load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, - &blimit_vec, &limit_vec, &thresh_vec); - load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, - &s11, &s12, &s13, &s14, &s15); - transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, - s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); - filter_hev_mask4_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, - q2, q3, &hev, &mask); - filter4_16(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); - s -= 2; - store_4x8(s, p, vget_low_u8(p1), vget_low_u8(p0), vget_low_u8(q0), - vget_low_u8(q1)); - store_4x8(s + 8 * p, p, vget_high_u8(p1), vget_high_u8(p0), vget_high_u8(q0), - vget_high_u8(q1)); -} - -void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, - op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; - uint32_t flat_status; - - load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); - load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); - mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, - p0, q0, q1, q2, q3, &flat, &flat_status, &hev); - filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, - &op1, &op0, &oq0, &oq1, &oq2); - store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2); -} - -void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, - op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; - uint32_t flat_status; - - load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, - &blimit_vec, &limit_vec, &thresh_vec); - load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); - mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, - p0, q0, q1, q2, q3, 
&flat, &flat_status, &hev); - filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, - &op1, &op0, &oq0, &oq1, &oq2); - store_16x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2); -} - -void vpx_lpf_vertical_8_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, - op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; - uint32_t flat_status; - - load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); - load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); - transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); - mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, - p0, q0, q1, q2, q3, &flat, &flat_status, &hev); - filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, - &op1, &op0, &oq0, &oq1, &oq2); - // Note: tranpose + store_8x8() is faster than store_6x8(). - transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3); - store_8x8(s - 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3); -} - -void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, - op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, - s15; - uint32_t flat_status; - - load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, - &blimit_vec, &limit_vec, &thresh_vec); - load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, - &s11, &s12, &s13, &s14, &s15); - transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, - s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); - mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, - p0, q0, q1, q2, q3, &flat, &flat_status, &hev); - filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, - &op1, &op0, &oq0, &oq1, &oq2); - // Note: store_6x8() twice is faster than tranpose + store_8x16(). 
- store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0), - vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2)); - store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1), - vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1), - vget_high_u8(oq2)); -} - -#define FUN_LPF_16_KERNEL(name, w) \ - static INLINE void lpf_16##name##kernel( \ - const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, \ - const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5, \ - const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ - const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ - const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ - const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \ - const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5, \ - uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2, \ - uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \ - uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3, \ - uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6, \ - uint32_t *flat_status, uint32_t *flat2_status) { \ - uint8x##w##_t blimit_vec, limit_vec, thresh_vec, mask, flat, flat2, hev; \ - \ - load_thresh_##w(blimit, limit, thresh, &blimit_vec, &limit_vec, \ - &thresh_vec); \ - mask = filter_flat_hev_mask_##w(limit_vec, blimit_vec, thresh_vec, p3, p2, \ - p1, p0, q0, q1, q2, q3, &flat, \ - flat_status, &hev); \ - flat2 = flat_mask5_##w(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, \ - flat2_status); \ - filter16_##w(mask, flat, *flat_status, flat2, *flat2_status, hev, p7, p6, \ - p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, \ - op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, \ - oq6); \ - } - -FUN_LPF_16_KERNEL(_, 8) // lpf_16_kernel -FUN_LPF_16_KERNEL(_dual_, 16) // lpf_16_dual_kernel -#undef FUN_LPF_16_KERNEL - -void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, - op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; - uint32_t flat_status, flat2_status; - - load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2, - &q3, &q4, &q5, &q6, &q7); - lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, - q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1, - &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status, - &flat2_status); - store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, - oq5, oq6, flat_status, flat2_status); -} - -void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, - op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; - uint32_t flat_status, flat2_status; - - load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); - p7 = vld1q_u8(s - 8 * p); - p6 = vld1q_u8(s - 7 * p); - p5 = vld1q_u8(s - 6 * p); - p4 = vld1q_u8(s - 5 * p); - q4 = vld1q_u8(s + 4 * p); - q5 = vld1q_u8(s + 5 * p); - q6 = vld1q_u8(s + 6 * p); - q7 = vld1q_u8(s + 7 * p); - lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, - q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, - &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, - &flat_status, &flat2_status); - 
store_16x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, - oq5, oq6, flat_status, flat2_status); -} - -void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, - op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; - uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; - uint32_t flat_status, flat2_status; - - s -= 8; - load_16x8(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - transpose_u8_16x8(s0, s1, s2, s3, s4, s5, s6, s7, &p7, &p6, &p5, &p4, &p3, - &p2, &p1, &p0, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); - lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, - q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1, - &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status, - &flat2_status); - if (flat_status) { - if (flat2_status) { - transpose_u8_8x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, - oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5, - &s6, &s7); - store_16x8(s, p, s0, s1, s2, s3, s4, s5, s6, s7); - } else { - // Note: tranpose + store_8x8() is faster than store_6x8(). - transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3); - store_8x8(s + 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3); - } - } else { - store_4x8(s + 6, p, op1, op0, oq0, oq1); - } -} - -void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, - op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; - uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, - s15; - uint32_t flat_status, flat2_status; - - s -= 8; - load_16x16(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, &s11, - &s12, &s13, &s14, &s15); - transpose_u8_16x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, - s14, s15, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, - &q2, &q3, &q4, &q5, &q6, &q7); - lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, - q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, - &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, - &flat_status, &flat2_status); - if (flat_status) { - if (flat2_status) { - transpose_u8_16x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, - oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5, - &s6, &s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14, - &s15); - store_16x16(s, p, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, - s13, s14, s15); - } else { - // Note: store_6x8() twice is faster than tranpose + store_8x16(). 
- s += 8; - store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0), - vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2)); - store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1), - vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1), - vget_high_u8(oq2)); - } - } else { - s += 6; - store_4x8(s, p, vget_low_u8(op1), vget_low_u8(op0), vget_low_u8(oq0), - vget_low_u8(oq1)); - store_4x8(s + 8 * p, p, vget_high_u8(op1), vget_high_u8(op0), - vget_high_u8(oq0), vget_high_u8(oq1)); - } -} diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_neon.c b/libs/libvpx/vpx_dsp/arm/loopfilter_neon.c index ced5aef0ab..7419cea022 100644 --- a/libs/libvpx/vpx_dsp/arm/loopfilter_neon.c +++ b/libs/libvpx/vpx_dsp/arm/loopfilter_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -9,15 +9,1084 @@ */ #include - -#include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" -#include "vpx/vpx_integer.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/transpose_neon.h" + +// For all the static inline functions, the functions ending with '_8' process +// 8 samples in a bunch, and the functions ending with '_16' process 16 samples +// in a bunch. + +#define FUN_LOAD_THRESH(w, r) \ + static INLINE void load_thresh_##w( \ + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, \ + uint8x##w##_t *blimit_vec, uint8x##w##_t *limit_vec, \ + uint8x##w##_t *thresh_vec) { \ + *blimit_vec = vld1##r##dup_u8(blimit); \ + *limit_vec = vld1##r##dup_u8(limit); \ + *thresh_vec = vld1##r##dup_u8(thresh); \ + } + +FUN_LOAD_THRESH(8, _) // load_thresh_8 +FUN_LOAD_THRESH(16, q_) // load_thresh_16 +#undef FUN_LOAD_THRESH + +static INLINE void load_thresh_8_dual( + const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, + uint8x16_t *blimit_vec, uint8x16_t *limit_vec, uint8x16_t *thresh_vec) { + *blimit_vec = vcombine_u8(vld1_dup_u8(blimit0), vld1_dup_u8(blimit1)); + *limit_vec = vcombine_u8(vld1_dup_u8(limit0), vld1_dup_u8(limit1)); + *thresh_vec = vcombine_u8(vld1_dup_u8(thresh0), vld1_dup_u8(thresh1)); +} + +// Here flat is 64-bit long, with each 8-bit (or 4-bit) chunk being a mask of a +// pixel. When used to control filter branches, we only detect whether it is all +// 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status. +// flat equals 0 if and only if flat_status equals 0. +// flat equals -1 (all 1s) if and only if flat_status equals -2. (This is true +// because each mask occupies more than 1 bit.) +static INLINE uint32_t calc_flat_status_8(uint8x8_t flat) { + return vget_lane_u32( + vreinterpret_u32_u64(vpaddl_u32(vreinterpret_u32_u8(flat))), 0); +} + +// Here flat is 128-bit long, with each 8-bit chunk being a mask of a pixel. +// When used to control filter branches, we only detect whether it is all 0s or +// all 1s. We narrowing shift right each 16-bit chunk by 4 arithmetically, so +// we get a 64-bit long number, with each 4-bit chunk being a mask of a pixel. +// Then we pairwise add flat to a 32-bit long number flat_status. +// flat equals 0 if and only if flat_status equals 0. +// flat equals -1 (all 1s) if and only if flat_status equals -2. (This is true +// because each mask occupies more than 1 bit.) 
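(Editor's illustration, not part of the patch: a scalar model of the narrowing shift described in the comment above, assuming every mask byte is 0x00 or 0xFF. calc_flat_status_16() below performs it with one vshrn_n_s16 and then reuses the 8-byte helper.)

#include <stdint.h>

/* Scalar model of the vshrn_n_s16(..., 4) step: each output byte keeps the
 * high nibble of one mask byte and the low nibble of the next, so an
 * all-0xFF (or all-0x00) 16-byte mask collapses to an all-0xFF (or all-0x00)
 * 8-byte mask. */
static void narrow_flat_16_to_8_scalar_model(const uint8_t flat[16],
                                             uint8_t flat_4bit[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    const unsigned pair = flat[2 * i] | ((unsigned)flat[2 * i + 1] << 8);
    flat_4bit[i] = (uint8_t)((pair >> 4) & 0xFF); /* bits 4..11 of the pair */
  }
}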
+static INLINE uint32_t calc_flat_status_16(uint8x16_t flat) { + const uint8x8_t flat_4bit = + vreinterpret_u8_s8(vshrn_n_s16(vreinterpretq_s16_u8(flat), 4)); + return calc_flat_status_8(flat_4bit); +} + +#define FUN_FILTER_HEV_MASK4(w, r) \ + static INLINE uint8x##w##_t filter_hev_mask4_##w( \ + const uint8x##w##_t limit, const uint8x##w##_t blimit, \ + const uint8x##w##_t thresh, const uint8x##w##_t p3, \ + const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \ + const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \ + const uint8x##w##_t q3, uint8x##w##_t *hev, uint8x##w##_t *mask) { \ + uint8x##w##_t max, t0, t1; \ + \ + max = vabd##r##u8(p1, p0); \ + max = vmax##r##u8(max, vabd##r##u8(q1, q0)); \ + *hev = vcgt##r##u8(max, thresh); \ + *mask = vmax##r##u8(max, vabd##r##u8(p3, p2)); \ + *mask = vmax##r##u8(*mask, vabd##r##u8(p2, p1)); \ + *mask = vmax##r##u8(*mask, vabd##r##u8(q2, q1)); \ + *mask = vmax##r##u8(*mask, vabd##r##u8(q3, q2)); \ + t0 = vabd##r##u8(p0, q0); \ + t1 = vabd##r##u8(p1, q1); \ + t0 = vqadd##r##u8(t0, t0); \ + t1 = vshr##r##n_u8(t1, 1); \ + t0 = vqadd##r##u8(t0, t1); \ + *mask = vcle##r##u8(*mask, limit); \ + t0 = vcle##r##u8(t0, blimit); \ + *mask = vand##r##u8(*mask, t0); \ + \ + return max; \ + } + +FUN_FILTER_HEV_MASK4(8, _) // filter_hev_mask4_8 +FUN_FILTER_HEV_MASK4(16, q_) // filter_hev_mask4_16 +#undef FUN_FILTER_HEV_MASK4 + +#define FUN_FILTER_FLAT_HEV_MASK(w, r) \ + static INLINE uint8x##w##_t filter_flat_hev_mask_##w( \ + const uint8x##w##_t limit, const uint8x##w##_t blimit, \ + const uint8x##w##_t thresh, const uint8x##w##_t p3, \ + const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \ + const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \ + const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status, \ + uint8x##w##_t *hev) { \ + uint8x##w##_t max, mask; \ + \ + max = filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, \ + q2, q3, hev, &mask); \ + *flat = vmax##r##u8(max, vabd##r##u8(p2, p0)); \ + *flat = vmax##r##u8(*flat, vabd##r##u8(q2, q0)); \ + *flat = vmax##r##u8(*flat, vabd##r##u8(p3, p0)); \ + *flat = vmax##r##u8(*flat, vabd##r##u8(q3, q0)); \ + *flat = vcle##r##u8(*flat, vdup##r##n_u8(1)); /* flat_mask4() */ \ + *flat = vand##r##u8(*flat, mask); \ + *flat_status = calc_flat_status_##w(*flat); \ + \ + return mask; \ + } + +FUN_FILTER_FLAT_HEV_MASK(8, _) // filter_flat_hev_mask_8 +FUN_FILTER_FLAT_HEV_MASK(16, q_) // filter_flat_hev_mask_16 +#undef FUN_FILTER_FLAT_HEV_MASK + +#define FUN_FLAT_MASK5(w, r) \ + static INLINE uint8x##w##_t flat_mask5_##w( \ + const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ + const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ + const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ + const uint8x##w##_t q4, const uint8x##w##_t flat, \ + uint32_t *flat2_status) { \ + uint8x##w##_t flat2 = vabd##r##u8(p4, p0); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(p3, p0)); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(p2, p0)); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(p1, p0)); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(q1, q0)); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(q2, q0)); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(q3, q0)); \ + flat2 = vmax##r##u8(flat2, vabd##r##u8(q4, q0)); \ + flat2 = vcle##r##u8(flat2, vdup##r##n_u8(1)); \ + flat2 = vand##r##u8(flat2, flat); \ + *flat2_status = calc_flat_status_##w(flat2); \ + \ + return flat2; \ + } + 
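(Editor's illustration, not part of the patch: the per-pixel test that the FUN_FLAT_MASK5 macro above vectorizes, written out in scalar C; the 8- and 16-lane instantiations follow.)

#include <stdint.h>

static int udiff(int a, int b) { return a > b ? a - b : b - a; }

/* Returns 0xFF where every outer tap is within 1 of its side's edge sample
 * (p-side against p0, q-side against q0) and the narrower flat mask is
 * already set, 0x00 otherwise. */
static uint8_t flat_mask5_scalar_model(uint8_t p4, uint8_t p3, uint8_t p2,
                                       uint8_t p1, uint8_t p0, uint8_t q0,
                                       uint8_t q1, uint8_t q2, uint8_t q3,
                                       uint8_t q4, uint8_t flat) {
  int m = udiff(p4, p0);
  if (udiff(p3, p0) > m) m = udiff(p3, p0);
  if (udiff(p2, p0) > m) m = udiff(p2, p0);
  if (udiff(p1, p0) > m) m = udiff(p1, p0);
  if (udiff(q1, q0) > m) m = udiff(q1, q0);
  if (udiff(q2, q0) > m) m = udiff(q2, q0);
  if (udiff(q3, q0) > m) m = udiff(q3, q0);
  if (udiff(q4, q0) > m) m = udiff(q4, q0);
  return (uint8_t)((m <= 1 ? 0xFF : 0x00) & flat); /* vcle then vand */
}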
+FUN_FLAT_MASK5(8, _) // flat_mask5_8 +FUN_FLAT_MASK5(16, q_) // flat_mask5_16 +#undef FUN_FLAT_MASK5 + +#define FUN_FLIP_SIGN(w, r) \ + static INLINE int8x##w##_t flip_sign_##w(const uint8x##w##_t v) { \ + const uint8x##w##_t sign_bit = vdup##r##n_u8(0x80); \ + return vreinterpret##r##s8_u8(veor##r##u8(v, sign_bit)); \ + } + +FUN_FLIP_SIGN(8, _) // flip_sign_8 +FUN_FLIP_SIGN(16, q_) // flip_sign_16 +#undef FUN_FLIP_SIGN + +#define FUN_FLIP_SIGN_BACK(w, r) \ + static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \ + const int8x##w##_t sign_bit = vdup##r##n_s8(0x80); \ + return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit)); \ + } + +FUN_FLIP_SIGN_BACK(8, _) // flip_sign_back_8 +FUN_FLIP_SIGN_BACK(16, q_) // flip_sign_back_16 +#undef FUN_FLIP_SIGN_BACK + +static INLINE void filter_update_8(const uint8x8_t sub0, const uint8x8_t sub1, + const uint8x8_t add0, const uint8x8_t add1, + uint16x8_t *sum) { + *sum = vsubw_u8(*sum, sub0); + *sum = vsubw_u8(*sum, sub1); + *sum = vaddw_u8(*sum, add0); + *sum = vaddw_u8(*sum, add1); +} + +static INLINE void filter_update_16(const uint8x16_t sub0, + const uint8x16_t sub1, + const uint8x16_t add0, + const uint8x16_t add1, uint16x8_t *sum0, + uint16x8_t *sum1) { + *sum0 = vsubw_u8(*sum0, vget_low_u8(sub0)); + *sum1 = vsubw_u8(*sum1, vget_high_u8(sub0)); + *sum0 = vsubw_u8(*sum0, vget_low_u8(sub1)); + *sum1 = vsubw_u8(*sum1, vget_high_u8(sub1)); + *sum0 = vaddw_u8(*sum0, vget_low_u8(add0)); + *sum1 = vaddw_u8(*sum1, vget_high_u8(add0)); + *sum0 = vaddw_u8(*sum0, vget_low_u8(add1)); + *sum1 = vaddw_u8(*sum1, vget_high_u8(add1)); +} + +static INLINE uint8x8_t calc_7_tap_filter_8_kernel(const uint8x8_t sub0, + const uint8x8_t sub1, + const uint8x8_t add0, + const uint8x8_t add1, + uint16x8_t *sum) { + filter_update_8(sub0, sub1, add0, add1, sum); + return vrshrn_n_u16(*sum, 3); +} + +static INLINE uint8x16_t calc_7_tap_filter_16_kernel( + const uint8x16_t sub0, const uint8x16_t sub1, const uint8x16_t add0, + const uint8x16_t add1, uint16x8_t *sum0, uint16x8_t *sum1) { + filter_update_16(sub0, sub1, add0, add1, sum0, sum1); + return vcombine_u8(vrshrn_n_u16(*sum0, 3), vrshrn_n_u16(*sum1, 3)); +} + +static INLINE uint8x8_t apply_15_tap_filter_8_kernel( + const uint8x8_t flat, const uint8x8_t sub0, const uint8x8_t sub1, + const uint8x8_t add0, const uint8x8_t add1, const uint8x8_t in, + uint16x8_t *sum) { + filter_update_8(sub0, sub1, add0, add1, sum); + return vbsl_u8(flat, vrshrn_n_u16(*sum, 4), in); +} + +static INLINE uint8x16_t apply_15_tap_filter_16_kernel( + const uint8x16_t flat, const uint8x16_t sub0, const uint8x16_t sub1, + const uint8x16_t add0, const uint8x16_t add1, const uint8x16_t in, + uint16x8_t *sum0, uint16x8_t *sum1) { + uint8x16_t t; + filter_update_16(sub0, sub1, add0, add1, sum0, sum1); + t = vcombine_u8(vrshrn_n_u16(*sum0, 4), vrshrn_n_u16(*sum1, 4)); + return vbslq_u8(flat, t, in); +} + +// 7-tap filter [1, 1, 1, 2, 1, 1, 1] +static INLINE void calc_7_tap_filter_8(const uint8x8_t p3, const uint8x8_t p2, + const uint8x8_t p1, const uint8x8_t p0, + const uint8x8_t q0, const uint8x8_t q1, + const uint8x8_t q2, const uint8x8_t q3, + uint8x8_t *op2, uint8x8_t *op1, + uint8x8_t *op0, uint8x8_t *oq0, + uint8x8_t *oq1, uint8x8_t *oq2) { + uint16x8_t sum; + sum = vaddl_u8(p3, p3); // 2*p3 + sum = vaddw_u8(sum, p3); // 3*p3 + sum = vaddw_u8(sum, p2); // 3*p3+p2 + sum = vaddw_u8(sum, p2); // 3*p3+2*p2 + sum = vaddw_u8(sum, p1); // 3*p3+2*p2+p1 + sum = vaddw_u8(sum, p0); // 3*p3+2*p2+p1+p0 + sum = vaddw_u8(sum, q0); // 
3*p3+2*p2+p1+p0+q0 + *op2 = vrshrn_n_u16(sum, 3); + *op1 = calc_7_tap_filter_8_kernel(p3, p2, p1, q1, &sum); + *op0 = calc_7_tap_filter_8_kernel(p3, p1, p0, q2, &sum); + *oq0 = calc_7_tap_filter_8_kernel(p3, p0, q0, q3, &sum); + *oq1 = calc_7_tap_filter_8_kernel(p2, q0, q1, q3, &sum); + *oq2 = calc_7_tap_filter_8_kernel(p1, q1, q2, q3, &sum); +} + +static INLINE void calc_7_tap_filter_16( + const uint8x16_t p3, const uint8x16_t p2, const uint8x16_t p1, + const uint8x16_t p0, const uint8x16_t q0, const uint8x16_t q1, + const uint8x16_t q2, const uint8x16_t q3, uint8x16_t *op2, uint8x16_t *op1, + uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2) { + uint16x8_t sum0, sum1; + sum0 = vaddl_u8(vget_low_u8(p3), vget_low_u8(p3)); // 2*p3 + sum1 = vaddl_u8(vget_high_u8(p3), vget_high_u8(p3)); // 2*p3 + sum0 = vaddw_u8(sum0, vget_low_u8(p3)); // 3*p3 + sum1 = vaddw_u8(sum1, vget_high_u8(p3)); // 3*p3 + sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 3*p3+p2 + sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 3*p3+p2 + sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 3*p3+2*p2 + sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 3*p3+2*p2 + sum0 = vaddw_u8(sum0, vget_low_u8(p1)); // 3*p3+2*p2+p1 + sum1 = vaddw_u8(sum1, vget_high_u8(p1)); // 3*p3+2*p2+p1 + sum0 = vaddw_u8(sum0, vget_low_u8(p0)); // 3*p3+2*p2+p1+p0 + sum1 = vaddw_u8(sum1, vget_high_u8(p0)); // 3*p3+2*p2+p1+p0 + sum0 = vaddw_u8(sum0, vget_low_u8(q0)); // 3*p3+2*p2+p1+p0+q0 + sum1 = vaddw_u8(sum1, vget_high_u8(q0)); // 3*p3+2*p2+p1+p0+q0 + *op2 = vcombine_u8(vrshrn_n_u16(sum0, 3), vrshrn_n_u16(sum1, 3)); + *op1 = calc_7_tap_filter_16_kernel(p3, p2, p1, q1, &sum0, &sum1); + *op0 = calc_7_tap_filter_16_kernel(p3, p1, p0, q2, &sum0, &sum1); + *oq0 = calc_7_tap_filter_16_kernel(p3, p0, q0, q3, &sum0, &sum1); + *oq1 = calc_7_tap_filter_16_kernel(p2, q0, q1, q3, &sum0, &sum1); + *oq2 = calc_7_tap_filter_16_kernel(p1, q1, q2, q3, &sum0, &sum1); +} + +#define FUN_APPLY_7_TAP_FILTER(w, r) \ + static INLINE void apply_7_tap_filter_##w( \ + const uint8x##w##_t flat, const uint8x##w##_t p3, \ + const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \ + const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \ + const uint8x##w##_t q3, uint8x##w##_t *op2, uint8x##w##_t *op1, \ + uint8x##w##_t *op0, uint8x##w##_t *oq0, uint8x##w##_t *oq1, \ + uint8x##w##_t *oq2) { \ + uint8x##w##_t tp1, tp0, tq0, tq1; \ + calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0, \ + &tq0, &tq1, oq2); \ + *op2 = vbsl##r##u8(flat, *op2, p2); \ + *op1 = vbsl##r##u8(flat, tp1, *op1); \ + *op0 = vbsl##r##u8(flat, tp0, *op0); \ + *oq0 = vbsl##r##u8(flat, tq0, *oq0); \ + *oq1 = vbsl##r##u8(flat, tq1, *oq1); \ + *oq2 = vbsl##r##u8(flat, *oq2, q2); \ + } + +FUN_APPLY_7_TAP_FILTER(8, _) // apply_7_tap_filter_8 +FUN_APPLY_7_TAP_FILTER(16, q_) // apply_7_tap_filter_16 +#undef FUN_APPLY_7_TAP_FILTER + +// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] +static INLINE void apply_15_tap_filter_8( + const uint8x8_t flat2, const uint8x8_t p7, const uint8x8_t p6, + const uint8x8_t p5, const uint8x8_t p4, const uint8x8_t p3, + const uint8x8_t p2, const uint8x8_t p1, const uint8x8_t p0, + const uint8x8_t q0, const uint8x8_t q1, const uint8x8_t q2, + const uint8x8_t q3, const uint8x8_t q4, const uint8x8_t q5, + const uint8x8_t q6, const uint8x8_t q7, uint8x8_t *op6, uint8x8_t *op5, + uint8x8_t *op4, uint8x8_t *op3, uint8x8_t *op2, uint8x8_t *op1, + uint8x8_t *op0, uint8x8_t *oq0, uint8x8_t *oq1, uint8x8_t *oq2, + uint8x8_t *oq3, 
uint8x8_t *oq4, uint8x8_t *oq5, uint8x8_t *oq6) { + uint16x8_t sum; + sum = vshll_n_u8(p7, 3); // 8*p7 + sum = vsubw_u8(sum, p7); // 7*p7 + sum = vaddw_u8(sum, p6); // 7*p7+p6 + sum = vaddw_u8(sum, p6); // 7*p7+2*p6 + sum = vaddw_u8(sum, p5); // 7*p7+2*p6+p5 + sum = vaddw_u8(sum, p4); // 7*p7+2*p6+p5+p4 + sum = vaddw_u8(sum, p3); // 7*p7+2*p6+p5+p4+p3 + sum = vaddw_u8(sum, p2); // 7*p7+2*p6+p5+p4+p3+p2 + sum = vaddw_u8(sum, p1); // 7*p7+2*p6+p5+p4+p3+p2+p1 + sum = vaddw_u8(sum, p0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 + sum = vaddw_u8(sum, q0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 + *op6 = vbsl_u8(flat2, vrshrn_n_u16(sum, 4), p6); + *op5 = apply_15_tap_filter_8_kernel(flat2, p7, p6, p5, q1, p5, &sum); + *op4 = apply_15_tap_filter_8_kernel(flat2, p7, p5, p4, q2, p4, &sum); + *op3 = apply_15_tap_filter_8_kernel(flat2, p7, p4, p3, q3, p3, &sum); + *op2 = apply_15_tap_filter_8_kernel(flat2, p7, p3, p2, q4, *op2, &sum); + *op1 = apply_15_tap_filter_8_kernel(flat2, p7, p2, p1, q5, *op1, &sum); + *op0 = apply_15_tap_filter_8_kernel(flat2, p7, p1, p0, q6, *op0, &sum); + *oq0 = apply_15_tap_filter_8_kernel(flat2, p7, p0, q0, q7, *oq0, &sum); + *oq1 = apply_15_tap_filter_8_kernel(flat2, p6, q0, q1, q7, *oq1, &sum); + *oq2 = apply_15_tap_filter_8_kernel(flat2, p5, q1, q2, q7, *oq2, &sum); + *oq3 = apply_15_tap_filter_8_kernel(flat2, p4, q2, q3, q7, q3, &sum); + *oq4 = apply_15_tap_filter_8_kernel(flat2, p3, q3, q4, q7, q4, &sum); + *oq5 = apply_15_tap_filter_8_kernel(flat2, p2, q4, q5, q7, q5, &sum); + *oq6 = apply_15_tap_filter_8_kernel(flat2, p1, q5, q6, q7, q6, &sum); +} + +static INLINE void apply_15_tap_filter_16( + const uint8x16_t flat2, const uint8x16_t p7, const uint8x16_t p6, + const uint8x16_t p5, const uint8x16_t p4, const uint8x16_t p3, + const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0, + const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2, + const uint8x16_t q3, const uint8x16_t q4, const uint8x16_t q5, + const uint8x16_t q6, const uint8x16_t q7, uint8x16_t *op6, uint8x16_t *op5, + uint8x16_t *op4, uint8x16_t *op3, uint8x16_t *op2, uint8x16_t *op1, + uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2, + uint8x16_t *oq3, uint8x16_t *oq4, uint8x16_t *oq5, uint8x16_t *oq6) { + uint16x8_t sum0, sum1; + uint8x16_t t; + sum0 = vshll_n_u8(vget_low_u8(p7), 3); // 8*p7 + sum1 = vshll_n_u8(vget_high_u8(p7), 3); // 8*p7 + sum0 = vsubw_u8(sum0, vget_low_u8(p7)); // 7*p7 + sum1 = vsubw_u8(sum1, vget_high_u8(p7)); // 7*p7 + sum0 = vaddw_u8(sum0, vget_low_u8(p6)); // 7*p7+p6 + sum1 = vaddw_u8(sum1, vget_high_u8(p6)); // 7*p7+p6 + sum0 = vaddw_u8(sum0, vget_low_u8(p6)); // 7*p7+2*p6 + sum1 = vaddw_u8(sum1, vget_high_u8(p6)); // 7*p7+2*p6 + sum0 = vaddw_u8(sum0, vget_low_u8(p5)); // 7*p7+2*p6+p5 + sum1 = vaddw_u8(sum1, vget_high_u8(p5)); // 7*p7+2*p6+p5 + sum0 = vaddw_u8(sum0, vget_low_u8(p4)); // 7*p7+2*p6+p5+p4 + sum1 = vaddw_u8(sum1, vget_high_u8(p4)); // 7*p7+2*p6+p5+p4 + sum0 = vaddw_u8(sum0, vget_low_u8(p3)); // 7*p7+2*p6+p5+p4+p3 + sum1 = vaddw_u8(sum1, vget_high_u8(p3)); // 7*p7+2*p6+p5+p4+p3 + sum0 = vaddw_u8(sum0, vget_low_u8(p2)); // 7*p7+2*p6+p5+p4+p3+p2 + sum1 = vaddw_u8(sum1, vget_high_u8(p2)); // 7*p7+2*p6+p5+p4+p3+p2 + sum0 = vaddw_u8(sum0, vget_low_u8(p1)); // 7*p7+2*p6+p5+p4+p3+p2+p1 + sum1 = vaddw_u8(sum1, vget_high_u8(p1)); // 7*p7+2*p6+p5+p4+p3+p2+p1 + sum0 = vaddw_u8(sum0, vget_low_u8(p0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 + sum1 = vaddw_u8(sum1, vget_high_u8(p0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 + sum0 = vaddw_u8(sum0, vget_low_u8(q0)); // 
7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 + sum1 = vaddw_u8(sum1, vget_high_u8(q0)); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 + t = vcombine_u8(vrshrn_n_u16(sum0, 4), vrshrn_n_u16(sum1, 4)); + *op6 = vbslq_u8(flat2, t, p6); + *op5 = apply_15_tap_filter_16_kernel(flat2, p7, p6, p5, q1, p5, &sum0, &sum1); + *op4 = apply_15_tap_filter_16_kernel(flat2, p7, p5, p4, q2, p4, &sum0, &sum1); + *op3 = apply_15_tap_filter_16_kernel(flat2, p7, p4, p3, q3, p3, &sum0, &sum1); + *op2 = + apply_15_tap_filter_16_kernel(flat2, p7, p3, p2, q4, *op2, &sum0, &sum1); + *op1 = + apply_15_tap_filter_16_kernel(flat2, p7, p2, p1, q5, *op1, &sum0, &sum1); + *op0 = + apply_15_tap_filter_16_kernel(flat2, p7, p1, p0, q6, *op0, &sum0, &sum1); + *oq0 = + apply_15_tap_filter_16_kernel(flat2, p7, p0, q0, q7, *oq0, &sum0, &sum1); + *oq1 = + apply_15_tap_filter_16_kernel(flat2, p6, q0, q1, q7, *oq1, &sum0, &sum1); + *oq2 = + apply_15_tap_filter_16_kernel(flat2, p5, q1, q2, q7, *oq2, &sum0, &sum1); + *oq3 = apply_15_tap_filter_16_kernel(flat2, p4, q2, q3, q7, q3, &sum0, &sum1); + *oq4 = apply_15_tap_filter_16_kernel(flat2, p3, q3, q4, q7, q4, &sum0, &sum1); + *oq5 = apply_15_tap_filter_16_kernel(flat2, p2, q4, q5, q7, q5, &sum0, &sum1); + *oq6 = apply_15_tap_filter_16_kernel(flat2, p1, q5, q6, q7, q6, &sum0, &sum1); +} + +#define FUN_FILTER4(w, r) \ + static INLINE void filter4_##w( \ + const uint8x##w##_t mask, const uint8x##w##_t hev, \ + const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ + const uint8x##w##_t q1, uint8x##w##_t *op1, uint8x##w##_t *op0, \ + uint8x##w##_t *oq0, uint8x##w##_t *oq1) { \ + int8x##w##_t filter, filter1, filter2, t; \ + int8x##w##_t ps1 = flip_sign_##w(p1); \ + int8x##w##_t ps0 = flip_sign_##w(p0); \ + int8x##w##_t qs0 = flip_sign_##w(q0); \ + int8x##w##_t qs1 = flip_sign_##w(q1); \ + \ + /* add outer taps if we have high edge variance */ \ + filter = vqsub##r##s8(ps1, qs1); \ + filter = vand##r##s8(filter, vreinterpret##r##s8_u8(hev)); \ + t = vqsub##r##s8(qs0, ps0); \ + \ + /* inner taps */ \ + filter = vqadd##r##s8(filter, t); \ + filter = vqadd##r##s8(filter, t); \ + filter = vqadd##r##s8(filter, t); \ + filter = vand##r##s8(filter, vreinterpret##r##s8_u8(mask)); \ + \ + /* save bottom 3 bits so that we round one side +4 and the other +3 */ \ + /* if it equals 4 we'll set it to adjust by -1 to account for the fact */ \ + /* we'd round it by 3 the other way */ \ + filter1 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(4)), 3); \ + filter2 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(3)), 3); \ + \ + qs0 = vqsub##r##s8(qs0, filter1); \ + ps0 = vqadd##r##s8(ps0, filter2); \ + *oq0 = flip_sign_back_##w(qs0); \ + *op0 = flip_sign_back_##w(ps0); \ + \ + /* outer tap adjustments */ \ + filter = vrshr##r##n_s8(filter1, 1); \ + filter = vbic##r##s8(filter, vreinterpret##r##s8_u8(hev)); \ + \ + qs1 = vqsub##r##s8(qs1, filter); \ + ps1 = vqadd##r##s8(ps1, filter); \ + *oq1 = flip_sign_back_##w(qs1); \ + *op1 = flip_sign_back_##w(ps1); \ + } + +FUN_FILTER4(8, _) // filter4_8 +FUN_FILTER4(16, q_) // filter4_16 +#undef FUN_FILTER4 + +#define FUN_FILTER8(w) \ + static INLINE void filter8_##w( \ + const uint8x##w##_t mask, const uint8x##w##_t flat, \ + const uint32_t flat_status, const uint8x##w##_t hev, \ + const uint8x##w##_t p3, const uint8x##w##_t p2, const uint8x##w##_t p1, \ + const uint8x##w##_t p0, const uint8x##w##_t q0, const uint8x##w##_t q1, \ + const uint8x##w##_t q2, const uint8x##w##_t q3, uint8x##w##_t *op2, \ + uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \ + 
uint8x##w##_t *oq1, uint8x##w##_t *oq2) { \ + if (flat_status != (uint32_t)-2) { \ + filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1); \ + *op2 = p2; \ + *oq2 = q2; \ + if (flat_status) { \ + apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \ + op0, oq0, oq1, oq2); \ + } \ + } else { \ + calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, \ + oq0, oq1, oq2); \ + } \ + } + +FUN_FILTER8(8) // filter8_8 +FUN_FILTER8(16) // filter8_16 +#undef FUN_FILTER8 + +#define FUN_FILTER16(w) \ + static INLINE void filter16_##w( \ + const uint8x##w##_t mask, const uint8x##w##_t flat, \ + const uint32_t flat_status, const uint8x##w##_t flat2, \ + const uint32_t flat2_status, const uint8x##w##_t hev, \ + const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5, \ + const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ + const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ + const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ + const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \ + const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5, \ + uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2, \ + uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \ + uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3, \ + uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6) { \ + if (flat_status != (uint32_t)-2) { \ + filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1); \ + } \ + \ + if (flat_status) { \ + *op2 = p2; \ + *oq2 = q2; \ + if (flat2_status != (uint32_t)-2) { \ + apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \ + op0, oq0, oq1, oq2); \ + } \ + if (flat2_status) { \ + apply_15_tap_filter_##w(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, \ + q2, q3, q4, q5, q6, q7, op6, op5, op4, op3, \ + op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, \ + oq6); \ + } \ + } \ + } + +FUN_FILTER16(8) // filter16_8 +FUN_FILTER16(16) // filter16_16 +#undef FUN_FILTER16 + +#define FUN_LOAD8(w, r) \ + static INLINE void load_##w##x8( \ + const uint8_t *s, const int p, uint8x##w##_t *p3, uint8x##w##_t *p2, \ + uint8x##w##_t *p1, uint8x##w##_t *p0, uint8x##w##_t *q0, \ + uint8x##w##_t *q1, uint8x##w##_t *q2, uint8x##w##_t *q3) { \ + *p3 = vld1##r##u8(s); \ + s += p; \ + *p2 = vld1##r##u8(s); \ + s += p; \ + *p1 = vld1##r##u8(s); \ + s += p; \ + *p0 = vld1##r##u8(s); \ + s += p; \ + *q0 = vld1##r##u8(s); \ + s += p; \ + *q1 = vld1##r##u8(s); \ + s += p; \ + *q2 = vld1##r##u8(s); \ + s += p; \ + *q3 = vld1##r##u8(s); \ + } + +FUN_LOAD8(8, _) // load_8x8 +FUN_LOAD8(16, q_) // load_16x8 +#undef FUN_LOAD8 + +#define FUN_LOAD16(w, r) \ + static INLINE void load_##w##x16( \ + const uint8_t *s, const int p, uint8x##w##_t *s0, uint8x##w##_t *s1, \ + uint8x##w##_t *s2, uint8x##w##_t *s3, uint8x##w##_t *s4, \ + uint8x##w##_t *s5, uint8x##w##_t *s6, uint8x##w##_t *s7, \ + uint8x##w##_t *s8, uint8x##w##_t *s9, uint8x##w##_t *s10, \ + uint8x##w##_t *s11, uint8x##w##_t *s12, uint8x##w##_t *s13, \ + uint8x##w##_t *s14, uint8x##w##_t *s15) { \ + *s0 = vld1##r##u8(s); \ + s += p; \ + *s1 = vld1##r##u8(s); \ + s += p; \ + *s2 = vld1##r##u8(s); \ + s += p; \ + *s3 = vld1##r##u8(s); \ + s += p; \ + *s4 = vld1##r##u8(s); \ + s += p; \ + *s5 = vld1##r##u8(s); \ + s += p; \ + *s6 = vld1##r##u8(s); \ + s += p; \ + *s7 = vld1##r##u8(s); \ + s += p; \ + *s8 = vld1##r##u8(s); \ + s += p; \ + *s9 = vld1##r##u8(s); \ + s += p; \ + *s10 = vld1##r##u8(s); \ + 
s += p; \ + *s11 = vld1##r##u8(s); \ + s += p; \ + *s12 = vld1##r##u8(s); \ + s += p; \ + *s13 = vld1##r##u8(s); \ + s += p; \ + *s14 = vld1##r##u8(s); \ + s += p; \ + *s15 = vld1##r##u8(s); \ + } + +FUN_LOAD16(8, _) // load_8x16 +FUN_LOAD16(16, q_) // load_16x16 +#undef FUN_LOAD16 + +#define FUN_STORE4(w, r) \ + static INLINE void store_##w##x4( \ + uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \ + const uint8x##w##_t s2, const uint8x##w##_t s3) { \ + vst1##r##u8(s, s0); \ + s += p; \ + vst1##r##u8(s, s1); \ + s += p; \ + vst1##r##u8(s, s2); \ + s += p; \ + vst1##r##u8(s, s3); \ + } + +FUN_STORE4(8, _) // store_8x4 +FUN_STORE4(16, q_) // store_16x4 +#undef FUN_STORE4 + +#define FUN_STORE6(w, r) \ + static INLINE void store_##w##x6( \ + uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \ + const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \ + const uint8x##w##_t s5) { \ + vst1##r##u8(s, s0); \ + s += p; \ + vst1##r##u8(s, s1); \ + s += p; \ + vst1##r##u8(s, s2); \ + s += p; \ + vst1##r##u8(s, s3); \ + s += p; \ + vst1##r##u8(s, s4); \ + s += p; \ + vst1##r##u8(s, s5); \ + } + +FUN_STORE6(8, _) // store_8x6 +FUN_STORE6(16, q_) // store_16x6 +#undef FUN_STORE6 + +static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1, + const uint8x8_t p0, const uint8x8_t q0, + const uint8x8_t q1) { + uint8x8x4_t o; + + o.val[0] = p1; + o.val[1] = p0; + o.val[2] = q0; + o.val[3] = q1; + vst4_lane_u8(s, o, 0); + s += p; + vst4_lane_u8(s, o, 1); + s += p; + vst4_lane_u8(s, o, 2); + s += p; + vst4_lane_u8(s, o, 3); + s += p; + vst4_lane_u8(s, o, 4); + s += p; + vst4_lane_u8(s, o, 5); + s += p; + vst4_lane_u8(s, o, 6); + s += p; + vst4_lane_u8(s, o, 7); +} + +static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0, + const uint8x8_t s1, const uint8x8_t s2, + const uint8x8_t s3, const uint8x8_t s4, + const uint8x8_t s5) { + uint8x8x3_t o0, o1; + + o0.val[0] = s0; + o0.val[1] = s1; + o0.val[2] = s2; + o1.val[0] = s3; + o1.val[1] = s4; + o1.val[2] = s5; + vst3_lane_u8(s - 3, o0, 0); + vst3_lane_u8(s + 0, o1, 0); + s += p; + vst3_lane_u8(s - 3, o0, 1); + vst3_lane_u8(s + 0, o1, 1); + s += p; + vst3_lane_u8(s - 3, o0, 2); + vst3_lane_u8(s + 0, o1, 2); + s += p; + vst3_lane_u8(s - 3, o0, 3); + vst3_lane_u8(s + 0, o1, 3); + s += p; + vst3_lane_u8(s - 3, o0, 4); + vst3_lane_u8(s + 0, o1, 4); + s += p; + vst3_lane_u8(s - 3, o0, 5); + vst3_lane_u8(s + 0, o1, 5); + s += p; + vst3_lane_u8(s - 3, o0, 6); + vst3_lane_u8(s + 0, o1, 6); + s += p; + vst3_lane_u8(s - 3, o0, 7); + vst3_lane_u8(s + 0, o1, 7); +} + +#define FUN_STORE8(w, r) \ + static INLINE void store_##w##x8( \ + uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \ + const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4, \ + const uint8x##w##_t s5, const uint8x##w##_t s6, \ + const uint8x##w##_t s7) { \ + vst1##r##u8(s, s0); \ + s += p; \ + vst1##r##u8(s, s1); \ + s += p; \ + vst1##r##u8(s, s2); \ + s += p; \ + vst1##r##u8(s, s3); \ + s += p; \ + vst1##r##u8(s, s4); \ + s += p; \ + vst1##r##u8(s, s5); \ + s += p; \ + vst1##r##u8(s, s6); \ + s += p; \ + vst1##r##u8(s, s7); \ + } + +FUN_STORE8(8, _) // store_8x8 +FUN_STORE8(16, q_) // store_16x8 +#undef FUN_STORE8 + +#define FUN_STORE14(w, r) \ + static INLINE void store_##w##x14( \ + uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \ + const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ + const uint8x##w##_t p1, const 
uint8x##w##_t p0, const uint8x##w##_t q0, \ + const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ + const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \ + const uint32_t flat_status, const uint32_t flat2_status) { \ + if (flat_status) { \ + if (flat2_status) { \ + vst1##r##u8(s - 7 * p, p6); \ + vst1##r##u8(s - 6 * p, p5); \ + vst1##r##u8(s - 5 * p, p4); \ + vst1##r##u8(s - 4 * p, p3); \ + vst1##r##u8(s + 3 * p, q3); \ + vst1##r##u8(s + 4 * p, q4); \ + vst1##r##u8(s + 5 * p, q5); \ + vst1##r##u8(s + 6 * p, q6); \ + } \ + vst1##r##u8(s - 3 * p, p2); \ + vst1##r##u8(s + 2 * p, q2); \ + } \ + vst1##r##u8(s - 2 * p, p1); \ + vst1##r##u8(s - 1 * p, p0); \ + vst1##r##u8(s + 0 * p, q0); \ + vst1##r##u8(s + 1 * p, q1); \ + } + +FUN_STORE14(8, _) // store_8x14 +FUN_STORE14(16, q_) // store_16x14 +#undef FUN_STORE14 + +static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0, + const uint8x16_t s1, const uint8x16_t s2, + const uint8x16_t s3, const uint8x16_t s4, + const uint8x16_t s5, const uint8x16_t s6, + const uint8x16_t s7, const uint8x16_t s8, + const uint8x16_t s9, const uint8x16_t s10, + const uint8x16_t s11, const uint8x16_t s12, + const uint8x16_t s13, const uint8x16_t s14, + const uint8x16_t s15) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); + s += p; + vst1q_u8(s, s4); + s += p; + vst1q_u8(s, s5); + s += p; + vst1q_u8(s, s6); + s += p; + vst1q_u8(s, s7); + s += p; + vst1q_u8(s, s8); + s += p; + vst1q_u8(s, s9); + s += p; + vst1q_u8(s, s10); + s += p; + vst1q_u8(s, s11); + s += p; + vst1q_u8(s, s12); + s += p; + vst1q_u8(s, s13); + s += p; + vst1q_u8(s, s14); + s += p; + vst1q_u8(s, s15); +} + +#define FUN_HOR_4_KERNEL(name, w) \ + static INLINE void lpf_horizontal_4##name##kernel( \ + uint8_t *s, const int p, const uint8x##w##_t blimit, \ + const uint8x##w##_t limit, const uint8x##w##_t thresh) { \ + uint8x##w##_t p3, p2, p1, p0, q0, q1, q2, q3, mask, hev; \ + \ + load_##w##x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); \ + filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, q2, \ + q3, &hev, &mask); \ + filter4_##w(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); \ + store_##w##x4(s - 2 * p, p, p1, p0, q0, q1); \ + } + +FUN_HOR_4_KERNEL(_, 8) // lpf_horizontal_4_kernel +FUN_HOR_4_KERNEL(_dual_, 16) // lpf_horizontal_4_dual_kernel +#undef FUN_HOR_4_KERNEL + +void vpx_lpf_horizontal_4_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); + lpf_horizontal_4_kernel(s, p, blimit_vec, limit_vec, thresh_vec); +} + +void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + uint8x16_t blimit_vec, limit_vec, thresh_vec; + load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, + &blimit_vec, &limit_vec, &thresh_vec); + lpf_horizontal_4_dual_kernel(s, p, blimit_vec, limit_vec, thresh_vec); +} + +void vpx_lpf_vertical_4_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + mask, hev; + load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); + load_8x8(s - 4, p, &p3, &p2, &p1, &p0, 
&q0, &q1, &q2, &q3); + transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + filter_hev_mask4_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, + q2, q3, &hev, &mask); + filter4_8(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); + store_4x8(s - 2, p, p1, p0, q0, q1); +} void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0); - vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1); + uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + mask, hev; + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + s15; + + load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, + &blimit_vec, &limit_vec, &thresh_vec); + load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, + &s11, &s12, &s13, &s14, &s15); + transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + filter_hev_mask4_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, + q2, q3, &hev, &mask); + filter4_16(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1); + s -= 2; + store_4x8(s, p, vget_low_u8(p1), vget_low_u8(p0), vget_low_u8(q0), + vget_low_u8(q1)); + store_4x8(s + 8 * p, p, vget_high_u8(p1), vget_high_u8(p0), vget_high_u8(q0), + vget_high_u8(q1)); +} + +void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); + load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, + p0, q0, q1, q2, q3, &flat, &flat_status, &hev); + filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2); + store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2); +} + +void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, + &blimit_vec, &limit_vec, &thresh_vec); + load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, + p0, q0, q1, q2, q3, &flat, &flat_status, &hev); + filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2); + store_16x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2); +} + +void vpx_lpf_vertical_8_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec); + load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, 
&q1, &q2, &q3); + mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, + p0, q0, q1, q2, q3, &flat, &flat_status, &hev); + filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2); + // Note: transpose + store_8x8() is faster than store_6x8(). + transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3); + store_8x8(s - 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3); +} + +void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + s15; + uint32_t flat_status; + + load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1, + &blimit_vec, &limit_vec, &thresh_vec); + load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, + &s11, &s12, &s13, &s14, &s15); + transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, + p0, q0, q1, q2, q3, &flat, &flat_status, &hev); + filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2); + // Note: store_6x8() twice is faster than transpose + store_8x16(). + store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0), + vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2)); + store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1), + vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1), + vget_high_u8(oq2)); +} + +#define FUN_LPF_16_KERNEL(name, w) \ + static INLINE void lpf_16##name##kernel( \ + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, \ + const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5, \ + const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ + const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ + const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ + const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \ + const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5, \ + uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2, \ + uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \ + uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3, \ + uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6, \ + uint32_t *flat_status, uint32_t *flat2_status) { \ + uint8x##w##_t blimit_vec, limit_vec, thresh_vec, mask, flat, flat2, hev; \ + \ + load_thresh_##w(blimit, limit, thresh, &blimit_vec, &limit_vec, \ + &thresh_vec); \ + mask = filter_flat_hev_mask_##w(limit_vec, blimit_vec, thresh_vec, p3, p2, \ + p1, p0, q0, q1, q2, q3, &flat, \ + flat_status, &hev); \ + flat2 = flat_mask5_##w(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, \ + flat2_status); \ + filter16_##w(mask, flat, *flat_status, flat2, *flat2_status, hev, p7, p6, \ + p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, \ + op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, \ + oq6); \ + } + +FUN_LPF_16_KERNEL(_, 8) // lpf_16_kernel +FUN_LPF_16_KERNEL(_dual_, 16) // lpf_16_dual_kernel +#undef FUN_LPF_16_KERNEL + +void vpx_lpf_horizontal_16_neon(uint8_t *s, int p, 
const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, + op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; + uint32_t flat_status, flat2_status; + + load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2, + &q3, &q4, &q5, &q6, &q7); + lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, + q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1, + &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status, + &flat2_status); + store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, + oq5, oq6, flat_status, flat2_status); +} + +void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, + op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; + uint32_t flat_status, flat2_status; + + load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + p7 = vld1q_u8(s - 8 * p); + p6 = vld1q_u8(s - 7 * p); + p5 = vld1q_u8(s - 6 * p); + p4 = vld1q_u8(s - 5 * p); + q4 = vld1q_u8(s + 4 * p); + q5 = vld1q_u8(s + 5 * p); + q6 = vld1q_u8(s + 6 * p); + q7 = vld1q_u8(s + 7 * p); + lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, + q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, + &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, + &flat_status, &flat2_status); + store_16x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, + oq5, oq6, flat_status, flat2_status); +} + +void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, + op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; + uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; + uint32_t flat_status, flat2_status; + + s -= 8; + load_16x8(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + transpose_u8_16x8(s0, s1, s2, s3, s4, s5, s6, s7, &p7, &p6, &p5, &p4, &p3, + &p2, &p1, &p0, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, + q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1, + &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status, + &flat2_status); + if (flat_status) { + if (flat2_status) { + transpose_u8_8x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, + oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5, + &s6, &s7); + store_16x8(s, p, s0, s1, s2, s3, s4, s5, s6, s7); + } else { + // Note: transpose + store_8x8() is faster than store_6x8(). 
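// s was rewound by 8 columns at the top of this function, so s + 4 is the p3
// column; the eight transposed columns written back here are p3..q3, covering
// every pixel filter8_8() may have changed (p3 and q3 are stored unmodified).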
+ transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3); + store_8x8(s + 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3); + } + } else { + store_4x8(s + 6, p, op1, op0, oq0, oq1); + } +} + +void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, + op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; + uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + s15; + uint32_t flat_status, flat2_status; + + s -= 8; + load_16x16(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, &s11, + &s12, &s13, &s14, &s15); + transpose_u8_16x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + s14, s15, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, + &q2, &q3, &q4, &q5, &q6, &q7); + lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, + q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, + &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, + &flat_status, &flat2_status); + if (flat_status) { + if (flat2_status) { + transpose_u8_16x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, + oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5, + &s6, &s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14, + &s15); + store_16x16(s, p, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + s13, s14, s15); + } else { + // Note: store_6x8() twice is faster than transpose + store_8x16(). + s += 8; + store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0), + vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2)); + store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1), + vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1), + vget_high_u8(oq2)); + } + } else { + s += 6; + store_4x8(s, p, vget_low_u8(op1), vget_low_u8(op0), vget_low_u8(oq0), + vget_low_u8(oq1)); + store_4x8(s + 8 * p, p, vget_high_u8(op1), vget_high_u8(op0), + vget_high_u8(oq0), vget_high_u8(oq1)); + } } diff --git a/libs/libvpx/vpx_dsp/arm/mem_neon.h b/libs/libvpx/vpx_dsp/arm/mem_neon.h new file mode 100644 index 0000000000..4efad5333e --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/mem_neon.h @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_ARM_MEM_NEON_H_ +#define VPX_DSP_ARM_MEM_NEON_H_ + +#include +#include +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Helper functions used to load tran_low_t into int16, narrowing if necessary. 
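tran_low_t is a 32-bit type in high-bit-depth builds and 16-bit otherwise, while the NEON code in this library works on int16 lanes, so these loaders narrow on load and the store widens again. A scalar sketch of the narrowing direction, with illustrative names and assuming the coefficient values already fit in 16 bits:

#include <stdint.h>

/* Illustrative stand-in for the library's tran_low_t, which is int32_t in
 * high-bit-depth builds and int16_t otherwise. */
typedef int32_t tran_low_sketch_t;

/* Scalar model of load_tran_low_to_s16q(): copy 8 coefficients, narrowing to
 * 16 bits on the assumption that the values already fit. */
static void load_tran_low_sketch(int16_t dst[8], const tran_low_sketch_t src[8]) {
  int i;
  for (i = 0; i < 8; ++i) dst[i] = (int16_t)src[i];
}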
+static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4x2_t v0 = vld2q_s32(buf); + const int32x4x2_t v1 = vld2q_s32(buf + 8); + const int16x4_t s0 = vmovn_s32(v0.val[0]); + const int16x4_t s1 = vmovn_s32(v0.val[1]); + const int16x4_t s2 = vmovn_s32(v1.val[0]); + const int16x4_t s3 = vmovn_s32(v1.val[1]); + int16x8x2_t res; + res.val[0] = vcombine_s16(s0, s2); + res.val[1] = vcombine_s16(s1, s3); + return res; +#else + return vld2q_s16(buf); +#endif +} + +static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vld1q_s32(buf); + const int32x4_t v1 = vld1q_s32(buf + 4); + const int16x4_t s0 = vmovn_s32(v0); + const int16x4_t s1 = vmovn_s32(v1); + return vcombine_s16(s0, s1); +#else + return vld1q_s16(buf); +#endif +} + +static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vld1q_s32(buf); + return vmovn_s32(v0); +#else + return vld1_s16(buf); +#endif +} + +static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); + const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); + vst1q_s32(buf, v0); + vst1q_s32(buf + 4, v1); +#else + vst1q_s16(buf, a); +#endif +} + +// Propagate type information to the compiler. Without this the compiler may +// assume the required alignment of uint32_t (4 bytes) and add alignment hints +// to the memory access. +// +// This is used for functions operating on uint8_t which wish to load or store 4 +// values at a time but which may not be on 4 byte boundaries. +static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) { + memcpy(buf, &a, 4); +} + +// Load 2 sets of 4 bytes when alignment is not guaranteed. +static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { + uint32_t a; + uint32x2_t a_u32 = vdup_n_u32(0); + if (stride == 4) return vld1_u8(buf); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vld1_lane_u32(&a, a_u32, 0); + memcpy(&a, buf, 4); + a_u32 = vld1_lane_u32(&a, a_u32, 1); + return vreinterpret_u8_u32(a_u32); +} + +// Store 2 sets of 4 bytes when alignment is not guaranteed. +static INLINE void store_unaligned_u8(uint8_t *buf, int stride, + const uint8x8_t a) { + const uint32x2_t a_u32 = vreinterpret_u32_u8(a); + if (stride == 4) { + vst1_u8(buf, a); + return; + } + uint32_to_mem(buf, vget_lane_u32(a_u32, 0)); + buf += stride; + uint32_to_mem(buf, vget_lane_u32(a_u32, 1)); +} + +// Load 4 sets of 4 bytes when alignment is not guaranteed. +static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { + uint32_t a; + uint32x4_t a_u32 = vdupq_n_u32(0); + if (stride == 4) return vld1q_u8(buf); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vld1q_lane_u32(&a, a_u32, 0); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vld1q_lane_u32(&a, a_u32, 1); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vld1q_lane_u32(&a, a_u32, 2); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vld1q_lane_u32(&a, a_u32, 3); + return vreinterpretq_u8_u32(a_u32); +} + +// Store 4 sets of 4 bytes when alignment is not guaranteed. 
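The unaligned helpers above and below go through memcpy() (or uint32_to_mem()) rather than casting to uint32_t *, so the compiler cannot attach 4-byte alignment assumptions to the accesses. A minimal usage sketch for copying a 4-wide, 2-row block between arbitrarily strided buffers; the function and its parameters are illustrative only:

#include <arm_neon.h>
#include "vpx_dsp/arm/mem_neon.h"

/* Copy two 4-byte rows between buffers whose rows need not be 4-byte aligned. */
static void copy_4x2_sketch(uint8_t *dst, int dst_stride,
                            const uint8_t *src, int src_stride) {
  const uint8x8_t rows = load_unaligned_u8(src, src_stride);
  store_unaligned_u8(dst, dst_stride, rows);
}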
+static INLINE void store_unaligned_u8q(uint8_t *buf, int stride, + const uint8x16_t a) { + const uint32x4_t a_u32 = vreinterpretq_u32_u8(a); + if (stride == 4) { + vst1q_u8(buf, a); + return; + } + uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0)); + buf += stride; + uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1)); + buf += stride; + uint32_to_mem(buf, vgetq_lane_u32(a_u32, 2)); + buf += stride; + uint32_to_mem(buf, vgetq_lane_u32(a_u32, 3)); +} + +// Load 2 sets of 4 bytes when alignment is guaranteed. +static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) { + uint32x2_t a = vdup_n_u32(0); + + assert(!((intptr_t)buf % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); + + a = vld1_lane_u32((const uint32_t *)buf, a, 0); + buf += stride; + a = vld1_lane_u32((const uint32_t *)buf, a, 1); + return vreinterpret_u8_u32(a); +} + +// Store 2 sets of 4 bytes when alignment is guaranteed. +static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) { + uint32x2_t a_u32 = vreinterpret_u32_u8(a); + + assert(!((intptr_t)buf % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); + + vst1_lane_u32((uint32_t *)buf, a_u32, 0); + buf += stride; + vst1_lane_u32((uint32_t *)buf, a_u32, 1); +} +#endif // VPX_DSP_ARM_MEM_NEON_H_ diff --git a/libs/libvpx/vpx_dsp/arm/quantize_neon.c b/libs/libvpx/vpx_dsp/arm/quantize_neon.c new file mode 100644 index 0000000000..a0a1e6dd5a --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/quantize_neon.c @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" + +void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + const int16x8_t one = vdupq_n_s16(1); + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); + + // Process first 8 values which include a dc component. + { + // Only the first element of each vector is DC. + const int16x8_t zbin = vld1q_s16(zbin_ptr); + const int16x8_t round = vld1q_s16(round_ptr); + const int16x8_t quant = vld1q_s16(quant_ptr); + const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + const int16x8_t dequant = vld1q_s16(dequant_ptr); + // Add one because the eob does not index from 0. 
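// Each lane of eob_max tracks the highest 1-based scan position (iscan + 1)
// seen so far for a non-zero quantized coefficient; the pairwise-max reduction
// at the end of the function collapses the lanes into the final *eob_ptr.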
+ const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + const int16x8_t zbin_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 + qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + qcoeff = vandq_s16(qcoeff, zbin_mask); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + + coeff_ptr += 8; + iscan_ptr += 8; + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + qcoeff_ptr += 8; + + qcoeff = vmulq_s16(qcoeff, dequant); + + store_s16q_to_tran_low(dqcoeff_ptr, qcoeff); + dqcoeff_ptr += 8; + } + + n_coeffs -= 8; + + { + const int16x8_t zbin = vdupq_n_s16(zbin_ptr[1]); + const int16x8_t round = vdupq_n_s16(round_ptr[1]); + const int16x8_t quant = vdupq_n_s16(quant_ptr[1]); + const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]); + const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]); + + do { + // Add one because the eob is not its index. + const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + const int16x8_t zbin_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 + qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + qcoeff = vandq_s16(qcoeff, zbin_mask); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + + coeff_ptr += 8; + iscan_ptr += 8; + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + qcoeff_ptr += 8; + + qcoeff = vmulq_s16(qcoeff, dequant); + + store_s16q_to_tran_low(dqcoeff_ptr, qcoeff); + dqcoeff_ptr += 8; + + n_coeffs -= 8; + } while (n_coeffs > 0); + } + + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +} + +static INLINE int32x4_t extract_sign_bit(int32x4_t a) { + return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); +} + +// Main difference is that zbin values are halved before comparison and dqcoeff +// values are divided by 2. zbin is rounded but dqcoeff is not. 
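The halving of dqcoeff mentioned above must match the C reference, which divides by 2 with truncation toward zero; a plain arithmetic right shift would round toward negative infinity, hence the extract_sign_bit() adjustment inside the function below. A scalar sketch of that step, using an illustrative helper and assuming the compiler implements >> on negative values as an arithmetic shift, as the NEON vshrn does:

#include <assert.h>
#include <stdint.h>

/* dqcoeff = (qcoeff * dequant) / 2, computed the way the NEON path does it:
 * add 1 to negative products so the shift truncates toward zero. */
static int16_t dequant_32x32_sketch(int16_t qcoeff, int16_t dequant) {
  const int32_t prod = (int32_t)qcoeff * dequant;
  const int32_t shifted = (prod + (int32_t)((uint32_t)prod >> 31)) >> 1;
  assert(shifted == prod / 2);  /* matches the C reference's division */
  return (int16_t)shifted;
}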
+void vpx_quantize_b_32x32_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const int16x8_t one = vdupq_n_s16(1); + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + int i; + (void)scan_ptr; + (void)n_coeffs; // Because we will always calculate 32*32. + (void)skip_block; + assert(!skip_block); + + // Process first 8 values which include a dc component. + { + // Only the first element of each vector is DC. + const int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); + const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); + const int16x8_t quant = vld1q_s16(quant_ptr); + const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + const int16x8_t dequant = vld1q_s16(dequant_ptr); + // Add one because the eob does not index from 0. + const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + const int16x8_t zbin_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + int16x8_t dqcoeff; + int32x4_t dqcoeff_0, dqcoeff_1; + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant_shift); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + qcoeff = vandq_s16(qcoeff, zbin_mask); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + + coeff_ptr += 8; + iscan_ptr += 8; + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + qcoeff_ptr += 8; + + dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + + // Add 1 if negative to round towards zero because the C uses division. + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff = + vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); + + store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + dqcoeff_ptr += 8; + } + + { + const int16x8_t zbin = vrshrq_n_s16(vdupq_n_s16(zbin_ptr[1]), 1); + const int16x8_t round = vrshrq_n_s16(vdupq_n_s16(round_ptr[1]), 1); + const int16x8_t quant = vdupq_n_s16(quant_ptr[1]); + const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]); + const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]); + + for (i = 1; i < 32 * 32 / 8; ++i) { + // Add one because the eob is not its index. 
+ const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + const int16x8_t zbin_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + int16x8_t dqcoeff; + int32x4_t dqcoeff_0, dqcoeff_1; + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant_shift); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + qcoeff = vandq_s16(qcoeff, zbin_mask); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + + coeff_ptr += 8; + iscan_ptr += 8; + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + qcoeff_ptr += 8; + + dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff = + vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); + + store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + dqcoeff_ptr += 8; + } + } + + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +} diff --git a/libs/libvpx/vpx_dsp/arm/sad4d_neon.c b/libs/libvpx/vpx_dsp/arm/sad4d_neon.c index dc20398000..b04de3aff2 100644 --- a/libs/libvpx/vpx_dsp/arm/sad4d_neon.c +++ b/libs/libvpx/vpx_dsp/arm/sad4d_neon.c @@ -13,212 +13,230 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, - const uint16x8_t vec_hi) { - const uint32x4_t vec_l_lo = - vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); - const uint32x4_t vec_l_hi = - vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); - const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} - -// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16, -// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo -// and vec_sum_ref_hi. 
-static void sad_neon_64(const uint8x16_t vec_src_00, - const uint8x16_t vec_src_16, - const uint8x16_t vec_src_32, - const uint8x16_t vec_src_48, const uint8_t *ref, - uint16x8_t *vec_sum_ref_lo, - uint16x8_t *vec_sum_ref_hi) { - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); - const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); - - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32), - vget_low_u8(vec_ref_32)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32), - vget_high_u8(vec_ref_32)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48), - vget_low_u8(vec_ref_48)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48), - vget_high_u8(vec_ref_48)); -} - -// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16, -// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi. -static void sad_neon_32(const uint8x16_t vec_src_00, - const uint8x16_t vec_src_16, const uint8_t *ref, - uint16x8_t *vec_sum_ref_lo, - uint16x8_t *vec_sum_ref_hi) { - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); -} - -void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t *res) { +void vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; + const uint8x16_t src_u8 = load_unaligned_u8q(src, src_stride); + for (i = 0; i < 4; ++i) { + const uint8x16_t ref_u8 = load_unaligned_u8q(ref[i], ref_stride); + uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); + res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); + } +} - for (i = 0; i < 64; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - const uint8x16_t vec_src_32 = vld1q_u8(src + 32); - const uint8x16_t vec_src_48 = vld1q_u8(src + 48); +void vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + int i; + 
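// The 4x8 source block is read as two 16-byte vectors (rows 0-3 and rows 4-7);
// each candidate reference is read the same way and its absolute differences
// are accumulated into a single uint16x8_t before the horizontal add.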
const uint8x16_t src_0 = load_unaligned_u8q(src, src_stride); + const uint8x16_t src_1 = load_unaligned_u8q(src + 4 * src_stride, src_stride); + for (i = 0; i < 4; ++i) { + const uint8x16_t ref_0 = load_unaligned_u8q(ref[i], ref_stride); + const uint8x16_t ref_1 = + load_unaligned_u8q(ref[i] + 4 * ref_stride, ref_stride); + uint16x8_t abs = vabdl_u8(vget_low_u8(src_0), vget_low_u8(ref_0)); + abs = vabal_u8(abs, vget_high_u8(src_0), vget_high_u8(ref_0)); + abs = vabal_u8(abs, vget_low_u8(src_1), vget_low_u8(ref_1)); + abs = vabal_u8(abs, vget_high_u8(src_1), vget_high_u8(ref_1)); + res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); + } +} - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0, - &vec_sum_ref0_lo, &vec_sum_ref0_hi); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1, - &vec_sum_ref1_lo, &vec_sum_ref1_hi); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2, - &vec_sum_ref2_lo, &vec_sum_ref2_hi); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3, - &vec_sum_ref3_lo, &vec_sum_ref3_hi); +static INLINE void sad8x_4d(const uint8_t *a, int a_stride, + const uint8_t *const b[4], int b_stride, + uint32_t *result, const int height) { + int i, j; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(a); + a += a_stride; + for (j = 0; j < 4; ++j) { + const uint8x8_t b_u8 = vld1_u8(b_loop[j]); + b_loop[j] += b_stride; + sum[j] = vabal_u8(sum[j], a_u8, b_u8); + } } - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); + for (j = 0; j < 4; ++j) { + result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); + } } -void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t *res) { - int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; +void vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad8x_4d(src, src_stride, ref, ref_stride, res, 4); +} - for (i = 0; i < 32; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); +void vpx_sad8x8x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad8x_4d(src, src_stride, ref, ref_stride, res, 8); +} - sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo, - &vec_sum_ref0_hi); - sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo, - &vec_sum_ref1_hi); - sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo, - &vec_sum_ref2_hi); - sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo, - 
&vec_sum_ref3_hi); +void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad8x_4d(src, src_stride, ref, ref_stride, res, 16); +} - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; +static INLINE void sad16x_4d(const uint8_t *a, int a_stride, + const uint8_t *const b[4], int b_stride, + uint32_t *result, const int height) { + int i, j; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + + for (i = 0; i < height; ++i) { + const uint8x16_t a_u8 = vld1q_u8(a); + a += a_stride; + for (j = 0; j < 4; ++j) { + const uint8x16_t b_u8 = vld1q_u8(b_loop[j]); + b_loop[j] += b_stride; + sum[j] = vabal_u8(sum[j], vget_low_u8(a_u8), vget_low_u8(b_u8)); + sum[j] = vabal_u8(sum[j], vget_high_u8(a_u8), vget_high_u8(b_u8)); + } } - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); + for (j = 0; j < 4; ++j) { + result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); + } +} + +void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad16x_4d(src, src_stride, ref, ref_stride, res, 8); } void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; + sad16x_4d(src, src_stride, ref, ref_stride, res, 16); +} - for (i = 0; i < 16; ++i) { - const uint8x16_t vec_src = vld1q_u8(src); - const uint8x16_t vec_ref0 = vld1q_u8(ref0); - const uint8x16_t vec_ref1 = vld1q_u8(ref1); - const uint8x16_t vec_ref2 = vld1q_u8(ref2); - const uint8x16_t vec_ref3 = vld1q_u8(ref3); +void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad16x_4d(src, src_stride, ref, ref_stride, res, 32); +} - vec_sum_ref0_lo = - vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0)); - vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref0)); - vec_sum_ref1_lo = - vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1)); - vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref1)); - vec_sum_ref2_lo = - vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2)); - vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref2)); - vec_sum_ref3_lo = - vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3)); - vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref3)); +static INLINE void sad32x_4d(const uint8_t *a, int a_stride, + const uint8_t *const b[4], int b_stride, + uint32_t *result, const int height) { 
+ int i, j; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(a); + const uint8x16_t a_1 = vld1q_u8(a + 16); + a += a_stride; + for (j = 0; j < 4; ++j) { + const uint8x16_t b_0 = vld1q_u8(b_loop[j]); + const uint8x16_t b_1 = vld1q_u8(b_loop[j] + 16); + b_loop[j] += b_stride; + sum[j] = vabal_u8(sum[j], vget_low_u8(a_0), vget_low_u8(b_0)); + sum[j] = vabal_u8(sum[j], vget_high_u8(a_0), vget_high_u8(b_0)); + sum[j] = vabal_u8(sum[j], vget_low_u8(a_1), vget_low_u8(b_1)); + sum[j] = vabal_u8(sum[j], vget_high_u8(a_1), vget_high_u8(b_1)); + } } - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); + for (j = 0; j < 4; ++j) { + result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); + } +} + +void vpx_sad32x16x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad32x_4d(src, src_stride, ref, ref_stride, res, 16); +} + +void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad32x_4d(src, src_stride, ref, ref_stride, res, 32); +} + +void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad32x_4d(src, src_stride, ref, ref_stride, res, 64); +} + +static INLINE void sum64x(const uint8x16_t a_0, const uint8x16_t a_1, + const uint8x16_t b_0, const uint8x16_t b_1, + uint16x8_t *sum) { + *sum = vabal_u8(*sum, vget_low_u8(a_0), vget_low_u8(b_0)); + *sum = vabal_u8(*sum, vget_high_u8(a_0), vget_high_u8(b_0)); + *sum = vabal_u8(*sum, vget_low_u8(a_1), vget_low_u8(b_1)); + *sum = vabal_u8(*sum, vget_high_u8(a_1), vget_high_u8(b_1)); +} + +static INLINE void sad64x_4d(const uint8_t *a, int a_stride, + const uint8_t *const b[4], int b_stride, + uint32_t *result, const int height) { + int i; + uint16x8_t sum_0 = vdupq_n_u16(0); + uint16x8_t sum_1 = vdupq_n_u16(0); + uint16x8_t sum_2 = vdupq_n_u16(0); + uint16x8_t sum_3 = vdupq_n_u16(0); + uint16x8_t sum_4 = vdupq_n_u16(0); + uint16x8_t sum_5 = vdupq_n_u16(0); + uint16x8_t sum_6 = vdupq_n_u16(0); + uint16x8_t sum_7 = vdupq_n_u16(0); + const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(a); + const uint8x16_t a_1 = vld1q_u8(a + 16); + const uint8x16_t a_2 = vld1q_u8(a + 32); + const uint8x16_t a_3 = vld1q_u8(a + 48); + a += a_stride; + sum64x(a_0, a_1, vld1q_u8(b_loop[0]), vld1q_u8(b_loop[0] + 16), &sum_0); + sum64x(a_2, a_3, vld1q_u8(b_loop[0] + 32), vld1q_u8(b_loop[0] + 48), + &sum_1); + b_loop[0] += b_stride; + sum64x(a_0, a_1, vld1q_u8(b_loop[1]), vld1q_u8(b_loop[1] + 16), &sum_2); + sum64x(a_2, a_3, vld1q_u8(b_loop[1] + 32), vld1q_u8(b_loop[1] + 48), + &sum_3); + b_loop[1] += b_stride; + sum64x(a_0, a_1, vld1q_u8(b_loop[2]), vld1q_u8(b_loop[2] + 16), &sum_4); + sum64x(a_2, a_3, vld1q_u8(b_loop[2] + 32), vld1q_u8(b_loop[2] + 48), + &sum_5); + b_loop[2] += b_stride; + sum64x(a_0, a_1, vld1q_u8(b_loop[3]), vld1q_u8(b_loop[3] + 16), &sum_6); + sum64x(a_2, a_3, vld1q_u8(b_loop[3] + 
32), vld1q_u8(b_loop[3] + 48), + &sum_7); + b_loop[3] += b_stride; + } + + result[0] = vget_lane_u32(horizontal_add_long_uint16x8(sum_0, sum_1), 0); + result[1] = vget_lane_u32(horizontal_add_long_uint16x8(sum_2, sum_3), 0); + result[2] = vget_lane_u32(horizontal_add_long_uint16x8(sum_4, sum_5), 0); + result[3] = vget_lane_u32(horizontal_add_long_uint16x8(sum_6, sum_7), 0); +} + +void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad64x_4d(src, src_stride, ref, ref_stride, res, 32); +} + +void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad64x_4d(src, src_stride, ref, ref_stride, res, 64); } diff --git a/libs/libvpx/vpx_dsp/arm/sad_neon.c b/libs/libvpx/vpx_dsp/arm/sad_neon.c index ff3228768c..9518a166bb 100644 --- a/libs/libvpx/vpx_dsp/arm/sad_neon.c +++ b/libs/libvpx/vpx_dsp/arm/sad_neon.c @@ -13,211 +13,332 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; +uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); +} + +uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); + const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); + uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); +} + +uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { int i; - - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); - - for (i = 0; i < 15; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); + uint16x8_t abs = vdupq_n_u16(0); + for (i = 0; i < 8; i += 4) { + const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; + abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(ref_u8)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); } - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); - - return vget_lane_u32(d5, 0); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); } -unsigned int vpx_sad4x4_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - 
uint32x2_t d1; - uint64x1_t d3; +uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { int i; - - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); - - for (i = 0; i < 3; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); + uint16x8_t abs = vdupq_n_u16(0); + for (i = 0; i < 8; i += 4) { + const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); + const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; + second_pred += 16; + abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(avg)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); } - d1 = vpaddl_u16(vget_low_u16(q12)); - d3 = vpaddl_u32(d1); - - return vget_lane_u32(vreinterpret_u32_u64(d3), 0); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); } -unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x16_t q0, q4; - uint16x8_t q12, q13; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; +static INLINE uint16x8_t sad8x(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, const int height) { int i; + uint16x8_t abs = vdupq_n_u16(0); - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(a); + const uint8x8_t b_u8 = vld1_u8(b); + a += a_stride; + b += b_stride; + abs = vabal_u8(abs, a_u8, b_u8); + } + return abs; +} - for (i = 0; i < 7; i++) { - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); +static INLINE uint16x8_t sad8x_avg(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *c, const int height) { + int i; + uint16x8_t abs = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(a); + const uint8x8_t b_u8 = vld1_u8(b); + const uint8x8_t c_u8 = vld1_u8(c); + const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); + a += a_stride; + b += b_stride; + c += 8; + abs = vabal_u8(abs, a_u8, avg); + } + return abs; +} + +#define sad8xN(n) \ + uint32_t vpx_sad8x##n##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad8x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ } - q12 = vaddq_u16(q12, q13); - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); +sad8xN(4); +sad8xN(8); +sad8xN(16); - return vget_lane_u32(d5, 0); -} - -static INLINE unsigned int 
horizontal_long_add_16x8(const uint16x8_t vec_lo, - const uint16x8_t vec_hi) { - const uint32x4_t vec_l_lo = - vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); - const uint32x4_t vec_l_hi = - vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); - const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} -static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { - const uint32x4_t a = vpaddlq_u16(vec_16x8); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} - -unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static INLINE uint16x8_t sad16x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const int height) { int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); - for (i = 0; i < 64; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - const uint8x16_t vec_src_32 = vld1q_u8(src + 32); - const uint8x16_t vec_src_48 = vld1q_u8(src + 48); - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); - const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); - src += src_stride; - ref += ref_stride; - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32), - vget_low_u8(vec_ref_32)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), - vget_high_u8(vec_ref_32)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), - vget_low_u8(vec_ref_48)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), - vget_high_u8(vec_ref_48)); + uint16x8_t abs = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + const uint8x16_t a_u8 = vld1q_u8(a); + const uint8x16_t b_u8 = vld1q_u8(b); + a += a_stride; + b += b_stride; + abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8)); + abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8)); } - return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); + return abs; } -unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static INLINE uint16x8_t sad16x_avg(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *c, const int height) { int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); + uint16x8_t abs = vdupq_n_u16(0); - for (i = 0; i < 32; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - src += src_stride; - ref += ref_stride; - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), 
- vget_high_u8(vec_ref_00)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); + for (i = 0; i < height; ++i) { + const uint8x16_t a_u8 = vld1q_u8(a); + const uint8x16_t b_u8 = vld1q_u8(b); + const uint8x16_t c_u8 = vld1q_u8(c); + const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); + a += a_stride; + b += b_stride; + c += 16; + abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg)); + abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg)); } - return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); + return abs; } -unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +#define sad16xN(n) \ + uint32_t vpx_sad16x##n##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + const uint16x8_t abs = sad16x(src, src_stride, ref, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad16x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } + +sad16xN(8); +sad16xN(16); +sad16xN(32); + +static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const int height) { int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); + uint16x8_t abs = vdupq_n_u16(0); - for (i = 0; i < 16; ++i) { - const uint8x16_t vec_src = vld1q_u8(src); - const uint8x16_t vec_ref = vld1q_u8(ref); - src += src_stride; - ref += ref_stride; - vec_accum_lo = - vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref)); - vec_accum_hi = - vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref)); + for (i = 0; i < height; ++i) { + const uint8x16_t a_lo = vld1q_u8(a); + const uint8x16_t a_hi = vld1q_u8(a + 16); + const uint8x16_t b_lo = vld1q_u8(b); + const uint8x16_t b_hi = vld1q_u8(b + 16); + a += a_stride; + b += b_stride; + abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo)); + abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo)); + abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi)); + abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(b_hi)); } - return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); + return abs; } -unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *c, const int height) { int i; - uint16x8_t vec_accum = vdupq_n_u16(0); + uint16x8_t abs = vdupq_n_u16(0); - for (i = 0; i < 8; ++i) { - const uint8x8_t vec_src = vld1_u8(src); - const uint8x8_t vec_ref = vld1_u8(ref); - src += src_stride; - ref += ref_stride; - vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); + for (i = 0; i < height; ++i) { + const uint8x16_t a_lo = vld1q_u8(a); + const uint8x16_t a_hi = vld1q_u8(a + 16); + const uint8x16_t b_lo = vld1q_u8(b); + const uint8x16_t b_hi = vld1q_u8(b + 16); + const uint8x16_t c_lo = vld1q_u8(c); + const uint8x16_t c_hi = vld1q_u8(c + 16); + const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); + const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); + a += a_stride; + b += b_stride; + c += 32; + abs = vabal_u8(abs, 
vget_low_u8(a_lo), vget_low_u8(avg_lo)); + abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo)); + abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi)); + abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(avg_hi)); } - return horizontal_add_16x8(vec_accum); + return abs; } + +#define sad32xN(n) \ + uint32_t vpx_sad32x##n##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + const uint16x8_t abs = sad32x(src, src_stride, ref, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad32x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } + +sad32xN(16); +sad32xN(32); +sad32xN(64); + +static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const int height) { + int i; + uint16x8_t abs_0 = vdupq_n_u16(0); + uint16x8_t abs_1 = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(a); + const uint8x16_t a_1 = vld1q_u8(a + 16); + const uint8x16_t a_2 = vld1q_u8(a + 32); + const uint8x16_t a_3 = vld1q_u8(a + 48); + const uint8x16_t b_0 = vld1q_u8(b); + const uint8x16_t b_1 = vld1q_u8(b + 16); + const uint8x16_t b_2 = vld1q_u8(b + 32); + const uint8x16_t b_3 = vld1q_u8(b + 48); + a += a_stride; + b += b_stride; + abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0)); + abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0)); + abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1)); + abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(b_1)); + abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(b_2)); + abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(b_2)); + abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(b_3)); + abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(b_3)); + } + + { + const uint32x4_t sum = vpaddlq_u16(abs_0); + return vpadalq_u16(sum, abs_1); + } +} + +static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *c, const int height) { + int i; + uint16x8_t abs_0 = vdupq_n_u16(0); + uint16x8_t abs_1 = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(a); + const uint8x16_t a_1 = vld1q_u8(a + 16); + const uint8x16_t a_2 = vld1q_u8(a + 32); + const uint8x16_t a_3 = vld1q_u8(a + 48); + const uint8x16_t b_0 = vld1q_u8(b); + const uint8x16_t b_1 = vld1q_u8(b + 16); + const uint8x16_t b_2 = vld1q_u8(b + 32); + const uint8x16_t b_3 = vld1q_u8(b + 48); + const uint8x16_t c_0 = vld1q_u8(c); + const uint8x16_t c_1 = vld1q_u8(c + 16); + const uint8x16_t c_2 = vld1q_u8(c + 32); + const uint8x16_t c_3 = vld1q_u8(c + 48); + const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); + const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); + const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); + const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); + a += a_stride; + b += b_stride; + c += 64; + abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0)); + abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0)); + abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1)); + abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(avg_1)); + abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(avg_2)); + abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(avg_2)); + abs_1 = 
vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(avg_3)); + abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(avg_3)); + } + + { + const uint32x4_t sum = vpaddlq_u16(abs_0); + return vpadalq_u16(sum, abs_1); + } +} + +#define sad64xN(n) \ + uint32_t vpx_sad64x##n##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + const uint32x4_t abs = sad64x(src, src_stride, ref, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ + } \ + \ + uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x4_t abs = \ + sad64x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ + } + +sad64xN(32); +sad64xN(64); diff --git a/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c index f044e11a15..4f58a7832a 100644 --- a/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c +++ b/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c @@ -12,25 +12,48 @@ #include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" -#include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/variance.h" +#include "vpx_dsp/arm/mem_neon.h" static const uint8_t bilinear_filters[8][2] = { { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, }; +// Process a block exactly 4 wide and a multiple of 2 high. +static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + const uint8_t *filter) { + const uint8x8_t f0 = vdup_n_u8(filter[0]); + const uint8x8_t f1 = vdup_n_u8(filter[1]); + unsigned int i; + for (i = 0; i < output_height; i += 2) { + const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line); + const uint8x8_t src_1 = + load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line); + const uint16x8_t a = vmull_u8(src_0, f0); + const uint16x8_t b = vmlal_u8(a, src_1, f1); + const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); + vst1_u8(output_ptr, out); + src_ptr += 2 * src_pixels_per_line; + output_ptr += 8; + } +} + +// Process a block exactly 8 wide and any height. static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *output_ptr, unsigned int src_pixels_per_line, int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter) { - const uint8x8_t f0 = vmov_n_u8(filter[0]); - const uint8x8_t f1 = vmov_n_u8(filter[1]); + const uint8x8_t f0 = vdup_n_u8(filter[0]); + const uint8x8_t f1 = vdup_n_u8(filter[1]); unsigned int i; for (i = 0; i < output_height; ++i) { const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); @@ -38,13 +61,13 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, const uint16x8_t a = vmull_u8(src_0, f0); const uint16x8_t b = vmlal_u8(a, src_1, f1); const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); - vst1_u8(&output_ptr[0], out); - // Next row... + vst1_u8(output_ptr, out); src_ptr += src_pixels_per_line; - output_ptr += output_width; + output_ptr += 8; } } +// Process a block which is a mutiple of 16 wide and any height. 
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *output_ptr, unsigned int src_pixels_per_line, @@ -52,8 +75,8 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, unsigned int output_height, unsigned int output_width, const uint8_t *filter) { - const uint8x8_t f0 = vmov_n_u8(filter[0]); - const uint8x8_t f1 = vmov_n_u8(filter[1]); + const uint8x8_t f0 = vdup_n_u8(filter[0]); + const uint8x8_t f1 = vdup_n_u8(filter[1]); unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 16) { @@ -65,69 +88,97 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); - vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi)); + vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi)); } - // Next row... src_ptr += src_pixels_per_line; output_ptr += output_width; } } -unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, - int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]); - DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); +// 4xM filter writes an extra row to fdata because it processes two rows at a +// time. +#define sub_pixel_varianceNxM(n, m) \ + uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ + uint8_t temp1[n * m]; \ + \ + if (n == 4) { \ + var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2), \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ + bilinear_filters[yoffset]); \ + } else if (n == 8) { \ + var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1), \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ + bilinear_filters[yoffset]); \ + } else { \ + var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ + bilinear_filters[yoffset]); \ + } \ + return vpx_variance##n##x##m(temp1, n, b, b_stride, sse); \ + } - var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8, - bilinear_filters[xoffset]); - var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8, - bilinear_filters[yoffset]); - return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse); -} +sub_pixel_varianceNxM(4, 4); +sub_pixel_varianceNxM(4, 8); +sub_pixel_varianceNxM(8, 4); +sub_pixel_varianceNxM(8, 8); +sub_pixel_varianceNxM(8, 16); +sub_pixel_varianceNxM(16, 8); +sub_pixel_varianceNxM(16, 16); +sub_pixel_varianceNxM(16, 32); +sub_pixel_varianceNxM(32, 16); +sub_pixel_varianceNxM(32, 32); +sub_pixel_varianceNxM(32, 64); +sub_pixel_varianceNxM(64, 32); +sub_pixel_varianceNxM(64, 64); -unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src, - int src_stride, int xoffset, - int yoffset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]); - DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); +// 4xM filter writes an extra row to fdata because it processes two rows at a +// time. 
+#define sub_pixel_avg_varianceNxM(n, m) \ + uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ + uint8_t temp1[n * m]; \ + \ + if (n == 4) { \ + var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2), \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ + bilinear_filters[yoffset]); \ + } else if (n == 8) { \ + var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1), \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ + bilinear_filters[yoffset]); \ + } else { \ + var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ + bilinear_filters[yoffset]); \ + } \ + \ + vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \ + \ + return vpx_variance##n##x##m(temp0, n, b, b_stride, sse); \ + } - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16, - bilinear_filters[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16, - bilinear_filters[yoffset]); - return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse); -} - -unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src, - int src_stride, int xoffset, - int yoffset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]); - DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); - - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32, - bilinear_filters[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32, - bilinear_filters[yoffset]); - return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse); -} - -unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src, - int src_stride, int xoffset, - int yoffset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]); - DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); - - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64, - bilinear_filters[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64, - bilinear_filters[yoffset]); - return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse); -} +sub_pixel_avg_varianceNxM(4, 4); +sub_pixel_avg_varianceNxM(4, 8); +sub_pixel_avg_varianceNxM(8, 4); +sub_pixel_avg_varianceNxM(8, 8); +sub_pixel_avg_varianceNxM(8, 16); +sub_pixel_avg_varianceNxM(16, 8); +sub_pixel_avg_varianceNxM(16, 16); +sub_pixel_avg_varianceNxM(16, 32); +sub_pixel_avg_varianceNxM(32, 16); +sub_pixel_avg_varianceNxM(32, 32); +sub_pixel_avg_varianceNxM(32, 64); +sub_pixel_avg_varianceNxM(64, 32); +sub_pixel_avg_varianceNxM(64, 64); diff --git a/libs/libvpx/vpx_dsp/arm/sum_neon.h b/libs/libvpx/vpx_dsp/arm/sum_neon.h new file mode 100644 index 0000000000..d74fe0cde4 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/sum_neon.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#ifndef VPX_DSP_ARM_SUM_NEON_H_
+#define VPX_DSP_ARM_SUM_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+static INLINE int32x2_t horizontal_add_int16x8(const int16x8_t a) {
+  const int32x4_t b = vpaddlq_s16(a);
+  const int64x2_t c = vpaddlq_s32(b);
+  return vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
+                  vreinterpret_s32_s64(vget_high_s64(c)));
+}
+
+static INLINE uint32x2_t horizontal_add_uint16x8(const uint16x8_t a) {
+  const uint32x4_t b = vpaddlq_u16(a);
+  const uint64x2_t c = vpaddlq_u32(b);
+  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+                  vreinterpret_u32_u64(vget_high_u64(c)));
+}
+
+static INLINE uint32x2_t horizontal_add_long_uint16x8(const uint16x8_t a,
+                                                      const uint16x8_t b) {
+  const uint32x4_t c = vpaddlq_u16(a);
+  const uint32x4_t d = vpadalq_u16(c, b);
+  const uint64x2_t e = vpaddlq_u32(d);
+  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(e)),
+                  vreinterpret_u32_u64(vget_high_u64(e)));
+}
+
+static INLINE uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) {
+  const uint64x2_t b = vpaddlq_u32(a);
+  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                  vreinterpret_u32_u64(vget_high_u64(b)));
+}
+#endif  // VPX_DSP_ARM_SUM_NEON_H_
diff --git a/libs/libvpx/vpx_dsp/arm/transpose_neon.h b/libs/libvpx/vpx_dsp/arm/transpose_neon.h
index 3d0b41f930..d85cbcee46 100644
--- a/libs/libvpx/vpx_dsp/arm/transpose_neon.h
+++ b/libs/libvpx/vpx_dsp/arm/transpose_neon.h
@@ -21,7 +21,7 @@
 //
 // b0.val[0]: 00 01 02 03 16 17 18 19
 // b0.val[1]: 04 05 06 07 20 21 22 23
-static INLINE int16x8x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
+static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
   int16x8x2_t b0;
   b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                            vreinterpret_s16_s32(vget_low_s32(a1)));
@@ -30,7 +30,23 @@ static INLINE int16x8x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
   return b0;
 }
 
-static INLINE uint8x16x2_t vpx_vtrnq_u64(uint32x4_t a0, uint32x4_t a1) {
+static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
+  int32x4x2_t b0;
+  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
+  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+  return b0;
+}
+
+static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
+  int64x2x2_t b0;
+  b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)),
+                           vreinterpret_s64_s32(vget_low_s32(a1)));
+  b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)),
+                           vreinterpret_s64_s32(vget_high_s32(a1)));
+  return b0;
+}
+
+static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
   uint8x16x2_t b0;
   b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)),
                           vreinterpret_u8_u32(vget_low_u32(a1)));
@@ -39,6 +55,463 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64(uint32x4_t a0, uint32x4_t a1) {
   return b0;
 }
 
+static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
+  uint16x8x2_t b0;
+  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
+                           vreinterpret_u16_u32(vget_low_u32(a1)));
+  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
+                           vreinterpret_u16_u32(vget_high_u32(a1)));
+  return b0;
+}
+
+static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
+  // Swap 16 bit elements.
Goes from: + // a0: 00 01 02 03 10 11 12 13 + // a1: 20 21 22 23 30 31 32 33 + // to: + // b0.val[0]: 00 01 20 21 10 11 30 31 + // b0.val[1]: 02 03 22 23 12 13 32 33 + + const uint16x4x2_t b0 = + vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1)); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 01 20 21 02 03 22 23 + // c0.val[1]: 10 11 30 31 12 13 32 33 + + const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]), + vreinterpret_u32_u16(b0.val[1])); + + // Swap 8 bit elements resulting in: + // d0.val[0]: 00 10 20 30 02 12 22 32 + // d0.val[1]: 01 11 21 31 03 13 23 33 + + const uint8x8x2_t d0 = + vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1])); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; +} + +static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1, + int16x4_t *a2, int16x4_t *a3) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + + const int16x4x2_t b0 = vtrn_s16(*a0, *a1); + const int16x4x2_t b1 = vtrn_s16(*a2, *a3); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + + const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), + vreinterpret_s32_s16(b1.val[0])); + const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), + vreinterpret_s32_s16(b1.val[1])); + + *a0 = vreinterpret_s16_s32(c0.val[0]); + *a1 = vreinterpret_s16_s32(c1.val[0]); + *a2 = vreinterpret_s16_s32(c0.val[1]); + *a3 = vreinterpret_s16_s32(c1.val[1]); +} + +static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 10 11 12 13 + // a1: 20 21 22 23 30 31 32 33 + // to: + // b0.val[0]: 00 01 20 21 10 11 30 31 + // b0.val[1]: 02 03 22 23 12 13 32 33 + + const int32x4x2_t b0 = + vtrnq_s32(vreinterpretq_s32_s16(*a0), vreinterpretq_s32_s16(*a1)); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 01 20 21 02 03 22 23 + // c0.val[1]: 10 11 30 31 12 13 32 33 + + const int32x4_t c0 = + vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b0.val[1])); + const int32x4_t c1 = + vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b0.val[1])); + + // Swap 16 bit elements resulting in: + // d0.val[0]: 00 10 20 30 02 12 22 32 + // d0.val[1]: 01 11 21 31 03 13 23 33 + + const int16x8x2_t d0 = + vtrnq_s16(vreinterpretq_s16_s32(c0), vreinterpretq_s16_s32(c1)); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; +} + +static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) { + // Swap 32 bit elements. 
Goes from: + // a0: 00 01 02 03 10 11 12 13 + // a1: 20 21 22 23 30 31 32 33 + // to: + // b0.val[0]: 00 01 20 21 10 11 30 31 + // b0.val[1]: 02 03 22 23 12 13 32 33 + + const uint32x4x2_t b0 = + vtrnq_u32(vreinterpretq_u32_u16(*a0), vreinterpretq_u32_u16(*a1)); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 01 20 21 02 03 22 23 + // c0.val[1]: 10 11 30 31 12 13 32 33 + + const uint32x4_t c0 = + vcombine_u32(vget_low_u32(b0.val[0]), vget_low_u32(b0.val[1])); + const uint32x4_t c1 = + vcombine_u32(vget_high_u32(b0.val[0]), vget_high_u32(b0.val[1])); + + // Swap 16 bit elements resulting in: + // d0.val[0]: 00 10 20 30 02 12 22 32 + // d0.val[1]: 01 11 21 31 03 13 23 33 + + const uint16x8x2_t d0 = + vtrnq_u16(vreinterpretq_u16_u32(c0), vreinterpretq_u16_u32(c1)); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; +} + +static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3, const uint8x8_t a4, + const uint8x8_t a5, const uint8x8_t a6, + const uint8x8_t a7) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 XX XX XX XX + // a1: 10 11 12 13 XX XX XX XX + // a2: 20 21 22 23 XX XX XX XX + // a3; 30 31 32 33 XX XX XX XX + // a4: 40 41 42 43 XX XX XX XX + // a5: 50 51 52 53 XX XX XX XX + // a6: 60 61 62 63 XX XX XX XX + // a7: 70 71 72 73 XX XX XX XX + // to: + // b0.val[0]: 00 01 02 03 40 41 42 43 + // b1.val[0]: 10 11 12 13 50 51 52 53 + // b2.val[0]: 20 21 22 23 60 61 62 63 + // b3.val[0]: 30 31 32 33 70 71 72 73 + + const uint32x2x2_t b0 = + vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4)); + const uint32x2x2_t b1 = + vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5)); + const uint32x2x2_t b2 = + vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6)); + const uint32x2x2_t b3 = + vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7)); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 01 20 21 40 41 60 61 + // c0.val[1]: 02 03 22 23 42 43 62 63 + // c1.val[0]: 10 11 30 31 50 51 70 71 + // c1.val[1]: 12 13 32 33 52 53 72 73 + + const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]), + vreinterpret_u16_u32(b2.val[0])); + const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]), + vreinterpret_u16_u32(b3.val[0])); + + // Swap 8 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 01 11 21 31 41 51 61 71 + // d1.val[0]: 02 12 22 32 42 52 62 72 + // d1.val[1]: 03 13 23 33 43 53 63 73 + + const uint8x8x2_t d0 = + vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0])); + const uint8x8x2_t d1 = + vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1])); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; + *a2 = d1.val[0]; + *a3 = d1.val[1]; +} + +static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1, + int32x4_t *a2, int32x4_t *a3) { + // Swap 32 bit elements. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + + const int32x4x2_t b0 = vtrnq_s32(*a0, *a1); + const int32x4x2_t b1 = vtrnq_s32(*a2, *a3); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + + const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]); + const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]); + + *a0 = c0.val[0]; + *a1 = c1.val[0]; + *a2 = c0.val[1]; + *a3 = c1.val[1]; +} + +static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1, + const int16x4_t a2, const int16x4_t a3, + const int16x4_t a4, const int16x4_t a5, + const int16x4_t a6, const int16x4_t a7, + int16x8_t *const o0, int16x8_t *const o1, + int16x8_t *const o2, int16x8_t *const o3) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + // b2.val[0]: 40 50 42 52 + // b2.val[1]: 41 51 43 53 + // b3.val[0]: 60 70 62 72 + // b3.val[1]: 61 71 63 73 + + const int16x4x2_t b0 = vtrn_s16(a0, a1); + const int16x4x2_t b1 = vtrn_s16(a2, a3); + const int16x4x2_t b2 = vtrn_s16(a4, a5); + const int16x4x2_t b3 = vtrn_s16(a6, a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 40 50 60 70 + // c2.val[1]: 42 52 62 72 + // c3.val[0]: 41 51 61 71 + // c3.val[1]: 43 53 63 73 + + const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), + vreinterpret_s32_s16(b1.val[0])); + const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), + vreinterpret_s32_s16(b1.val[1])); + const int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]), + vreinterpret_s32_s16(b3.val[0])); + const int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]), + vreinterpret_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // o0: 00 10 20 30 40 50 60 70 + // o1: 01 11 21 31 41 51 61 71 + // o2: 02 12 22 32 42 52 62 72 + // o3: 03 13 23 33 43 53 63 73 + + *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]), + vreinterpret_s16_s32(c2.val[0])); + *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]), + vreinterpret_s16_s32(c3.val[0])); + *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]), + vreinterpret_s16_s32(c2.val[1])); + *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]), + vreinterpret_s16_s32(c3.val[1])); +} + +static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1, + int32x4_t *const a2, int32x4_t *const a3, + int32x4_t *const a4, int32x4_t *const a5, + int32x4_t *const a6, int32x4_t *const a7) { + // Swap 32 bit elements. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + // b2.val[0]: 40 50 42 52 + // b2.val[1]: 41 51 43 53 + // b3.val[0]: 60 70 62 72 + // b3.val[1]: 61 71 63 73 + + const int32x4x2_t b0 = vtrnq_s32(*a0, *a1); + const int32x4x2_t b1 = vtrnq_s32(*a2, *a3); + const int32x4x2_t b2 = vtrnq_s32(*a4, *a5); + const int32x4x2_t b3 = vtrnq_s32(*a6, *a7); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 40 50 60 70 + // c2.val[1]: 42 52 62 72 + // c3.val[0]: 41 51 61 71 + // c3.val[1]: 43 53 63 73 + + const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b1.val[0]); + const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b1.val[1]); + const int64x2x2_t c2 = vpx_vtrnq_s64(b2.val[0], b3.val[0]); + const int64x2x2_t c3 = vpx_vtrnq_s64(b2.val[1], b3.val[1]); + + *a0 = vreinterpretq_s32_s64(c0.val[0]); + *a1 = vreinterpretq_s32_s64(c2.val[0]); + *a2 = vreinterpretq_s32_s64(c1.val[0]); + *a3 = vreinterpretq_s32_s64(c3.val[0]); + *a4 = vreinterpretq_s32_s64(c0.val[1]); + *a5 = vreinterpretq_s32_s64(c2.val[1]); + *a6 = vreinterpretq_s32_s64(c1.val[1]); + *a7 = vreinterpretq_s32_s64(c3.val[1]); +} + +static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3) { + // Swap 8 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + + const uint8x8x2_t b0 = vtrn_u8(*a0, *a1); + const uint8x8x2_t b1 = vtrn_u8(*a2, *a3); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + + const uint16x4x2_t c0 = + vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0])); + const uint16x4x2_t c1 = + vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1])); + + *a0 = vreinterpret_u8_u16(c0.val[0]); + *a1 = vreinterpret_u8_u16(c1.val[0]); + *a2 = vreinterpret_u8_u16(c0.val[1]); + *a3 = vreinterpret_u8_u16(c1.val[1]); +} + +static INLINE void transpose_u16_8x4(uint16x8_t *a0, uint16x8_t *a1, + uint16x8_t *a2, uint16x8_t *a3) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + + const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1); + const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + + const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), + vreinterpretq_u32_u16(b1.val[0])); + const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), + vreinterpretq_u32_u16(b1.val[1])); + + *a0 = vreinterpretq_u16_u32(c0.val[0]); + *a1 = vreinterpretq_u16_u32(c1.val[0]); + *a2 = vreinterpretq_u16_u32(c0.val[1]); + *a3 = vreinterpretq_u16_u32(c1.val[1]); +} + +static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1, + int32x4_t *const a2, int32x4_t *const a3, + int32x4_t *const a4, int32x4_t *const a5, + int32x4_t *const a6, int32x4_t *const a7) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 04 05 06 07 + // a2: 10 11 12 13 + // a3: 14 15 16 17 + // a4: 20 21 22 23 + // a5: 24 25 26 27 + // a6: 30 31 32 33 + // a7: 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 04 14 06 16 + // b1.val[1]: 05 15 07 17 + // b2.val[0]: 20 30 22 32 + // b2.val[1]: 21 31 23 33 + // b3.val[0]: 24 34 26 36 + // b3.val[1]: 25 35 27 37 + + const int32x4x2_t b0 = vtrnq_s32(*a0, *a2); + const int32x4x2_t b1 = vtrnq_s32(*a1, *a3); + const int32x4x2_t b2 = vtrnq_s32(*a4, *a6); + const int32x4x2_t b3 = vtrnq_s32(*a5, *a7); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 04 14 24 34 + // c2.val[1]: 06 16 26 36 + // c3.val[0]: 05 15 25 35 + // c3.val[1]: 07 17 27 37 + + const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b2.val[0]); + const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b2.val[1]); + const int64x2x2_t c2 = vpx_vtrnq_s64(b1.val[0], b3.val[0]); + const int64x2x2_t c3 = vpx_vtrnq_s64(b1.val[1], b3.val[1]); + + *a0 = vreinterpretq_s32_s64(c0.val[0]); + *a1 = vreinterpretq_s32_s64(c1.val[0]); + *a2 = vreinterpretq_s32_s64(c0.val[1]); + *a3 = vreinterpretq_s32_s64(c1.val[1]); + *a4 = vreinterpretq_s32_s64(c2.val[0]); + *a5 = vreinterpretq_s32_s64(c3.val[0]); + *a6 = vreinterpretq_s32_s64(c2.val[1]); + *a7 = vreinterpretq_s32_s64(c3.val[1]); +} + // Note: Using 'd' registers or 'q' registers has almost identical speed. We use // 'q' registers here to save some instructions. 
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, @@ -151,10 +624,10 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 - const int16x8x2_t d0 = vpx_vtrnq_s64(c0.val[0], c2.val[0]); - const int16x8x2_t d1 = vpx_vtrnq_s64(c1.val[0], c3.val[0]); - const int16x8x2_t d2 = vpx_vtrnq_s64(c0.val[1], c2.val[1]); - const int16x8x2_t d3 = vpx_vtrnq_s64(c1.val[1], c3.val[1]); + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); *a0 = d0.val[0]; *a1 = d1.val[0]; @@ -166,6 +639,154 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, *a7 = d3.val[1]; } +static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1, + uint16x8_t *a2, uint16x8_t *a3, + uint16x8_t *a4, uint16x8_t *a5, + uint16x8_t *a6, uint16x8_t *a7) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1); + const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3); + const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5); + const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), + vreinterpretq_u32_u16(b1.val[0])); + const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), + vreinterpretq_u32_u16(b1.val[1])); + const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]), + vreinterpretq_u32_u16(b3.val[0])); + const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]), + vreinterpretq_u32_u16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]); + const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]); + const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]); + const uint16x8x2_t d3 = vpx_vtrnq_u64_to_u16(c1.val[1], c3.val[1]); + + *a0 = d0.val[0]; + *a1 = d1.val[0]; + *a2 = d2.val[0]; + *a3 = d3.val[0]; + *a4 = d0.val[1]; + *a5 = d1.val[1]; + *a6 = d2.val[1]; + *a7 = d3.val[1]; +} + +static INLINE 
void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1, + int32x4x2_t *a2, int32x4x2_t *a3, + int32x4x2_t *a4, int32x4x2_t *a5, + int32x4x2_t *a6, int32x4x2_t *a7) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0: 00 10 02 12 01 11 03 13 + // b1: 20 30 22 32 21 31 23 33 + // b2: 40 50 42 52 41 51 43 53 + // b3: 60 70 62 72 61 71 63 73 + // b4: 04 14 06 16 05 15 07 17 + // b5: 24 34 26 36 25 35 27 37 + // b6: 44 54 46 56 45 55 47 57 + // b7: 64 74 66 76 65 75 67 77 + + const int32x4x2_t b0 = vtrnq_s32(a0->val[0], a1->val[0]); + const int32x4x2_t b1 = vtrnq_s32(a2->val[0], a3->val[0]); + const int32x4x2_t b2 = vtrnq_s32(a4->val[0], a5->val[0]); + const int32x4x2_t b3 = vtrnq_s32(a6->val[0], a7->val[0]); + const int32x4x2_t b4 = vtrnq_s32(a0->val[1], a1->val[1]); + const int32x4x2_t b5 = vtrnq_s32(a2->val[1], a3->val[1]); + const int32x4x2_t b6 = vtrnq_s32(a4->val[1], a5->val[1]); + const int32x4x2_t b7 = vtrnq_s32(a6->val[1], a7->val[1]); + + // Swap 64 bit elements resulting in: + // c0: 00 10 20 30 02 12 22 32 + // c1: 01 11 21 31 03 13 23 33 + // c2: 40 50 60 70 42 52 62 72 + // c3: 41 51 61 71 43 53 63 73 + // c4: 04 14 24 34 06 16 26 36 + // c5: 05 15 25 35 07 17 27 37 + // c6: 44 54 64 74 46 56 66 76 + // c7: 45 55 65 75 47 57 67 77 + const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]); + const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]); + const int32x4x2_t c2 = vpx_vtrnq_s64_to_s32(b2.val[0], b3.val[0]); + const int32x4x2_t c3 = vpx_vtrnq_s64_to_s32(b2.val[1], b3.val[1]); + const int32x4x2_t c4 = vpx_vtrnq_s64_to_s32(b4.val[0], b5.val[0]); + const int32x4x2_t c5 = vpx_vtrnq_s64_to_s32(b4.val[1], b5.val[1]); + const int32x4x2_t c6 = vpx_vtrnq_s64_to_s32(b6.val[0], b7.val[0]); + const int32x4x2_t c7 = vpx_vtrnq_s64_to_s32(b6.val[1], b7.val[1]); + + // Swap 128 bit elements resulting in: + // a0: 00 10 20 30 40 50 60 70 + // a1: 01 11 21 31 41 51 61 71 + // a2: 02 12 22 32 42 52 62 72 + // a3: 03 13 23 33 43 53 63 73 + // a4: 04 14 24 34 44 54 64 74 + // a5: 05 15 25 35 45 55 65 75 + // a6: 06 16 26 36 46 56 66 76 + // a7: 07 17 27 37 47 57 67 77 + a0->val[0] = c0.val[0]; + a0->val[1] = c2.val[0]; + a1->val[0] = c1.val[0]; + a1->val[1] = c3.val[0]; + a2->val[0] = c0.val[1]; + a2->val[1] = c2.val[1]; + a3->val[0] = c1.val[1]; + a3->val[1] = c3.val[1]; + a4->val[0] = c4.val[0]; + a4->val[1] = c6.val[0]; + a5->val[0] = c5.val[0]; + a5->val[1] = c7.val[0]; + a6->val[0] = c4.val[1]; + a6->val[1] = c6.val[1]; + a7->val[0] = c5.val[1]; + a7->val[1] = c7.val[1]; +} + static INLINE void transpose_u8_16x8( const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2, const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5, @@ -519,14 +1140,14 @@ static INLINE void transpose_u8_16x16( // e6.val[1]: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE // e7.val[0]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7 // e7.val[1]: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF - const uint8x16x2_t e0 = vpx_vtrnq_u64(d0.val[0], d4.val[0]); - const uint8x16x2_t e1 = vpx_vtrnq_u64(d2.val[0], d6.val[0]); - const uint8x16x2_t e2 = vpx_vtrnq_u64(d1.val[0], d5.val[0]); - const uint8x16x2_t e3 = vpx_vtrnq_u64(d3.val[0], d7.val[0]); - const uint8x16x2_t e4 = vpx_vtrnq_u64(d0.val[1], d4.val[1]); - const 
uint8x16x2_t e5 = vpx_vtrnq_u64(d2.val[1], d6.val[1]); - const uint8x16x2_t e6 = vpx_vtrnq_u64(d1.val[1], d5.val[1]); - const uint8x16x2_t e7 = vpx_vtrnq_u64(d3.val[1], d7.val[1]); + const uint8x16x2_t e0 = vpx_vtrnq_u64_to_u8(d0.val[0], d4.val[0]); + const uint8x16x2_t e1 = vpx_vtrnq_u64_to_u8(d2.val[0], d6.val[0]); + const uint8x16x2_t e2 = vpx_vtrnq_u64_to_u8(d1.val[0], d5.val[0]); + const uint8x16x2_t e3 = vpx_vtrnq_u64_to_u8(d3.val[0], d7.val[0]); + const uint8x16x2_t e4 = vpx_vtrnq_u64_to_u8(d0.val[1], d4.val[1]); + const uint8x16x2_t e5 = vpx_vtrnq_u64_to_u8(d2.val[1], d6.val[1]); + const uint8x16x2_t e6 = vpx_vtrnq_u64_to_u8(d1.val[1], d5.val[1]); + const uint8x16x2_t e7 = vpx_vtrnq_u64_to_u8(d3.val[1], d7.val[1]); // Output: // o0 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0 @@ -563,4 +1184,133 @@ static INLINE void transpose_u8_16x16( *o15 = e7.val[1]; } +static INLINE void load_and_transpose_u8_4x8(const uint8_t *a, + const int a_stride, uint8x8_t *a0, + uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3) { + uint8x8_t a4, a5, a6, a7; + *a0 = vld1_u8(a); + a += a_stride; + *a1 = vld1_u8(a); + a += a_stride; + *a2 = vld1_u8(a); + a += a_stride; + *a3 = vld1_u8(a); + a += a_stride; + a4 = vld1_u8(a); + a += a_stride; + a5 = vld1_u8(a); + a += a_stride; + a6 = vld1_u8(a); + a += a_stride; + a7 = vld1_u8(a); + + transpose_u8_4x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +static INLINE void load_and_transpose_u8_8x8(const uint8_t *a, + const int a_stride, uint8x8_t *a0, + uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3, uint8x8_t *a4, + uint8x8_t *a5, uint8x8_t *a6, + uint8x8_t *a7) { + *a0 = vld1_u8(a); + a += a_stride; + *a1 = vld1_u8(a); + a += a_stride; + *a2 = vld1_u8(a); + a += a_stride; + *a3 = vld1_u8(a); + a += a_stride; + *a4 = vld1_u8(a); + a += a_stride; + *a5 = vld1_u8(a); + a += a_stride; + *a6 = vld1_u8(a); + a += a_stride; + *a7 = vld1_u8(a); + + transpose_u8_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +static INLINE void transpose_and_store_u8_8x8(uint8_t *a, const int a_stride, + uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x8_t a4, uint8x8_t a5, + uint8x8_t a6, uint8x8_t a7) { + transpose_u8_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + vst1_u8(a, a0); + a += a_stride; + vst1_u8(a, a1); + a += a_stride; + vst1_u8(a, a2); + a += a_stride; + vst1_u8(a, a3); + a += a_stride; + vst1_u8(a, a4); + a += a_stride; + vst1_u8(a, a5); + a += a_stride; + vst1_u8(a, a6); + a += a_stride; + vst1_u8(a, a7); +} + +static INLINE void load_and_transpose_s16_8x8(const int16_t *a, + const int a_stride, int16x8_t *a0, + int16x8_t *a1, int16x8_t *a2, + int16x8_t *a3, int16x8_t *a4, + int16x8_t *a5, int16x8_t *a6, + int16x8_t *a7) { + *a0 = vld1q_s16(a); + a += a_stride; + *a1 = vld1q_s16(a); + a += a_stride; + *a2 = vld1q_s16(a); + a += a_stride; + *a3 = vld1q_s16(a); + a += a_stride; + *a4 = vld1q_s16(a); + a += a_stride; + *a5 = vld1q_s16(a); + a += a_stride; + *a6 = vld1q_s16(a); + a += a_stride; + *a7 = vld1q_s16(a); + + transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +static INLINE void load_and_transpose_s32_8x8( + const int32_t *a, const int a_stride, int32x4x2_t *const a0, + int32x4x2_t *const a1, int32x4x2_t *const a2, int32x4x2_t *const a3, + int32x4x2_t *const a4, int32x4x2_t *const a5, int32x4x2_t *const a6, + int32x4x2_t *const a7) { + a0->val[0] = vld1q_s32(a); + a0->val[1] = vld1q_s32(a + 4); + a += a_stride; + a1->val[0] = vld1q_s32(a); + a1->val[1] = vld1q_s32(a + 4); + a += a_stride; + a2->val[0] = vld1q_s32(a); + a2->val[1] = vld1q_s32(a + 4); + 
a += a_stride; + a3->val[0] = vld1q_s32(a); + a3->val[1] = vld1q_s32(a + 4); + a += a_stride; + a4->val[0] = vld1q_s32(a); + a4->val[1] = vld1q_s32(a + 4); + a += a_stride; + a5->val[0] = vld1q_s32(a); + a5->val[1] = vld1q_s32(a + 4); + a += a_stride; + a6->val[0] = vld1q_s32(a); + a6->val[1] = vld1q_s32(a + 4); + a += a_stride; + a7->val[0] = vld1q_s32(a); + a7->val[1] = vld1q_s32(a + 4); + + transpose_s32_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} #endif // VPX_DSP_ARM_TRANSPOSE_NEON_H_ diff --git a/libs/libvpx/vpx_dsp/arm/variance_neon.c b/libs/libvpx/vpx_dsp/arm/variance_neon.c index b6d7f86a4b..61c2c16a72 100644 --- a/libs/libvpx/vpx_dsp/arm/variance_neon.c +++ b/libs/libvpx/vpx_dsp/arm/variance_neon.c @@ -9,99 +9,204 @@ */ #include +#include #include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { - const int32x4_t a = vpaddlq_s16(v_16x8); - const int64x2_t b = vpaddlq_s32(a); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); +// The variance helper functions use int16_t for sum. 8 values are accumulated +// and then added (at which point they expand up to int32_t). To avoid overflow, +// there can be no more than 32767 / 255 ~= 128 values accumulated in each +// column. For a 32x32 buffer, this results in 32 / 8 = 4 values per row * 32 +// rows = 128. Asserts have been added to each function to warn against reaching +// this limit. + +// Process a block of width 4 four rows at a time. +static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int h, uint32_t *sse, int *sum) { + int i; + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_lo_s32 = vdupq_n_s32(0); + int32x4_t sse_hi_s32 = vdupq_n_s32(0); + + // Since width is only 4, sum_s16 only loads a half row per loop. + assert(h <= 256); + + for (i = 0; i < h; i += 4) { + const uint8x16_t a_u8 = load_unaligned_u8q(a, a_stride); + const uint8x16_t b_u8 = load_unaligned_u8q(b, b_stride); + const uint16x8_t diff_lo_u16 = + vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); + const uint16x8_t diff_hi_u16 = + vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)); + + const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16); + const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16); + + sum_s16 = vaddq_s16(sum_s16, diff_lo_s16); + sum_s16 = vaddq_s16(sum_s16, diff_hi_s16); + + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16), + vget_low_s16(diff_lo_s16)); + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16), + vget_high_s16(diff_lo_s16)); + + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16), + vget_low_s16(diff_hi_s16)); + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), + vget_high_s16(diff_hi_s16)); + + a += 4 * a_stride; + b += 4 * b_stride; + } + + *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); + *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( + vaddq_s32(sse_lo_s32, sse_hi_s32))), + 0); } -static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { - const int64x2_t b = vpaddlq_s32(v_32x4); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); -} - -// w * h must be less than 2048 or local variable v_sum may overflow. 
-static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, uint32_t *sse, - int *sum) { +// Process a block of any size where the width is divisible by 16. +static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, uint32_t *sse, + int *sum) { int i, j; - int16x8_t v_sum = vdupq_n_s16(0); - int32x4_t v_sse_lo = vdupq_n_s32(0); - int32x4_t v_sse_hi = vdupq_n_s32(0); + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_lo_s32 = vdupq_n_s32(0); + int32x4_t sse_hi_s32 = vdupq_n_s32(0); + + // The loop loads 16 values at a time but doubles them up when accumulating + // into sum_s16. + assert(w / 8 * h <= 128); for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - const uint8x8_t v_a = vld1_u8(&a[j]); - const uint8x8_t v_b = vld1_u8(&b[j]); - const uint16x8_t v_diff = vsubl_u8(v_a, v_b); - const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); - v_sum = vaddq_s16(v_sum, sv_diff); - v_sse_lo = - vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff)); - v_sse_hi = - vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff)); + for (j = 0; j < w; j += 16) { + const uint8x16_t a_u8 = vld1q_u8(a + j); + const uint8x16_t b_u8 = vld1q_u8(b + j); + + const uint16x8_t diff_lo_u16 = + vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); + const uint16x8_t diff_hi_u16 = + vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)); + + const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16); + const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16); + + sum_s16 = vaddq_s16(sum_s16, diff_lo_s16); + sum_s16 = vaddq_s16(sum_s16, diff_hi_s16); + + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16), + vget_low_s16(diff_lo_s16)); + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16), + vget_high_s16(diff_lo_s16)); + + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16), + vget_low_s16(diff_hi_s16)); + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), + vget_high_s16(diff_hi_s16)); } a += a_stride; b += b_stride; } - *sum = horizontal_add_s16x8(v_sum); - *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); + *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); + *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( + vaddq_s32(sse_lo_s32, sse_hi_s32))), + 0); +} + +// Process a block of width 8 two rows at a time. +static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int h, uint32_t *sse, int *sum) { + int i = 0; + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_lo_s32 = vdupq_n_s32(0); + int32x4_t sse_hi_s32 = vdupq_n_s32(0); + + // Each column has it's own accumulator entry in sum_s16. 
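+  // Each row adds one |a - b| difference (at most 255) to every lane, so the
+  // 32767 / 255 ~= 128 bound described at the top of this file caps h at 128
+  // rows for this width-8 kernel.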
+ assert(h <= 128); + + do { + const uint8x8_t a_0_u8 = vld1_u8(a); + const uint8x8_t a_1_u8 = vld1_u8(a + a_stride); + const uint8x8_t b_0_u8 = vld1_u8(b); + const uint8x8_t b_1_u8 = vld1_u8(b + b_stride); + const uint16x8_t diff_0_u16 = vsubl_u8(a_0_u8, b_0_u8); + const uint16x8_t diff_1_u16 = vsubl_u8(a_1_u8, b_1_u8); + const int16x8_t diff_0_s16 = vreinterpretq_s16_u16(diff_0_u16); + const int16x8_t diff_1_s16 = vreinterpretq_s16_u16(diff_1_u16); + sum_s16 = vaddq_s16(sum_s16, diff_0_s16); + sum_s16 = vaddq_s16(sum_s16, diff_1_s16); + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_0_s16), + vget_low_s16(diff_0_s16)); + sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_1_s16), + vget_low_s16(diff_1_s16)); + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_0_s16), + vget_high_s16(diff_0_s16)); + sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_1_s16), + vget_high_s16(diff_1_s16)); + a += a_stride + a_stride; + b += b_stride + b_stride; + i += 2; + } while (i < h); + + *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); + *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( + vaddq_s32(sse_lo_s32, sse_hi_s32))), + 0); } void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, unsigned int *sse, int *sum) { - variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum); + variance_neon_w8x2(a, a_stride, b, b_stride, 8, sse, sum); } void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, unsigned int *sse, int *sum) { - variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum); + variance_neon_w16(a, a_stride, b, b_stride, 16, 16, sse, sum); } -unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); - return *sse - ((sum * sum) >> 6); -} +#define varianceNxM(n, m, shift) \ + unsigned int vpx_variance##n##x##m##_neon(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse) { \ + int sum; \ + if (n == 4) \ + variance_neon_w4x4(a, a_stride, b, b_stride, m, sse, &sum); \ + else if (n == 8) \ + variance_neon_w8x2(a, a_stride, b, b_stride, m, sse, &sum); \ + else \ + variance_neon_w16(a, a_stride, b, b_stride, n, m, sse, &sum); \ + if (n * m < 16 * 16) \ + return *sse - ((sum * sum) >> shift); \ + else \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + } -unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); - return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); -} - -unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); -} +varianceNxM(4, 4, 4); +varianceNxM(4, 8, 5); +varianceNxM(8, 4, 5); +varianceNxM(8, 8, 6); +varianceNxM(8, 16, 7); +varianceNxM(16, 8, 7); +varianceNxM(16, 16, 8); +varianceNxM(16, 32, 9); +varianceNxM(32, 16, 9); +varianceNxM(32, 32, 10); unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, unsigned int *sse) { int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); - variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), 
b_stride, - 32, 32, &sse2, &sum2); + variance_neon_w16(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); + variance_neon_w16(a + (32 * a_stride), a_stride, b + (32 * b_stride), + b_stride, 32, 32, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); @@ -112,9 +217,9 @@ unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride, unsigned int *sse) { int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, - 64, 16, &sse2, &sum2); + variance_neon_w16(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w16(a + (16 * a_stride), a_stride, b + (16 * b_stride), + b_stride, 64, 16, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); @@ -126,162 +231,24 @@ unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride, int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, - 64, 16, &sse2, &sum2); + variance_neon_w16(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w16(a + (16 * a_stride), a_stride, b + (16 * b_stride), + b_stride, 64, 16, &sse2, &sum2); sse1 += sse2; sum1 += sum2; - variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), + b_stride, 64, 16, &sse2, &sum2); sse1 += sse2; sum1 += sum2; - variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), + b_stride, 64, 16, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); } -unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 4; i++) { - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - 
q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vpx_variance8x16_neon(const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, unsigned int *sse) { - int i; - uint8x8_t d0u8, d2u8, d4u8, d6u8; - int16x4_t d22s16, d23s16, d24s16, d25s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint16x8_t q11u16, q12u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d2u8, d6u8); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse) { diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c deleted file mode 100644 index 69cb284005..0000000000 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx/vpx_integer.h" -#include "vpx_ports/mem.h" - -static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1, - int16x4_t dsrc2, int16x4_t dsrc3, - int16x4_t dsrc4, int16x4_t dsrc5, - int16x4_t dsrc6, int16x4_t dsrc7, - int16x8_t q0s16) { - int32x4_t qdst; - int16x4_t d0s16, d1s16; - - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - - qdst = vmull_lane_s16(dsrc0, d0s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); - qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); - return qdst; -} - -void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, int h) { - int width; - const uint8_t *s; - uint8_t *d; - uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; - uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32; - uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - uint16x8x2_t q0x2u16; - uint8x8x2_t d0x2u8, d1x2u8; - uint32x2x2_t d0x2u32; - uint16x4x2_t d0x2u16, d1x2u16; - uint32x4x2_t q0x2u32; - - assert(x_step_q4 == 16); - - q0s16 = vld1q_s16(filter_x); - - src -= 3; // adjust for taps - for (; h > 0; h -= 4) { // loop_horiz_v - s = src; - d24u8 = vld1_u8(s); - s += src_stride; - d25u8 = vld1_u8(s); - s += src_stride; - d26u8 = vld1_u8(s); - s += src_stride; - d27u8 = vld1_u8(s); - - q12u8 = vcombine_u8(d24u8, d25u8); - q13u8 = vcombine_u8(d26u8, d27u8); - - q0x2u16 = - vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8)); - d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); - d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); - d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); - d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); - d0x2u8 = vtrn_u8(d24u8, d25u8); - d1x2u8 = vtrn_u8(d26u8, d27u8); - - __builtin_prefetch(src + src_stride * 4); - __builtin_prefetch(src + src_stride * 5); - - q8u16 = vmovl_u8(d0x2u8.val[0]); - q9u16 = vmovl_u8(d0x2u8.val[1]); - q10u16 = vmovl_u8(d1x2u8.val[0]); - q11u16 = vmovl_u8(d1x2u8.val[1]); - - src += 7; - d16u16 = vget_low_u16(q8u16); - d17u16 = vget_high_u16(q8u16); - d18u16 = vget_low_u16(q9u16); - d19u16 = vget_high_u16(q9u16); - q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 - q9u16 = vcombine_u16(d17u16, d19u16); - - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 - for (width = w; width > 0; width -= 4, src += 4, dst += 4) { 
// loop_horiz - s = src; - d28u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d29u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d31u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d30u32 = vld1_dup_u32((const uint32_t *)s); - - __builtin_prefetch(src + 64); - - d0x2u16 = - vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32)); - d1x2u16 = - vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32)); - d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 - vreinterpret_u8_u16(d1x2u16.val[0])); // d29 - d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 - vreinterpret_u8_u16(d1x2u16.val[1])); // d30 - - __builtin_prefetch(src + 64 + src_stride); - - q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); - q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); - q0x2u32 = - vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8)); - - d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); - d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); - q12u16 = vmovl_u8(d28u8); - q13u16 = vmovl_u8(d29u8); - - __builtin_prefetch(src + 64 + src_stride * 2); - - d = dst; - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); - d += dst_stride; - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16, - d23s16, d24s16, q0s16); - q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, - d24s16, d26s16, q0s16); - q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, - d26s16, d27s16, q0s16); - q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, - d27s16, d25s16, q0s16); - - __builtin_prefetch(src + 64 + src_stride * 3); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u8 = vqmovn_u16(q1u16); - d3u8 = vqmovn_u16(q2u16); - - d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8)); - d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), - vreinterpret_u32_u16(d0x2u16.val[1])); - d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), - vreinterpret_u8_u32(d0x2u32.val[1])); - - q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); - q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); - - q1u8 = vrhaddq_u8(q1u8, q3u8); - - d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); - d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); - - d = dst; - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - - q8u16 = q9u16; - d20s16 = d23s16; - q11u16 = q12u16; - q9u16 = q13u16; - d23s16 = 
vreinterpret_s16_u16(vget_high_u16(q11u16)); - } - src += src_stride * 4 - w - 7; - dst += dst_stride * 4 - w; - } - return; -} - -void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, int w, - int h) { - int height; - const uint8_t *s; - uint8_t *d; - uint8x8_t d2u8, d3u8; - uint32x2_t d2u32, d3u32, d6u32, d7u32; - uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; - uint8x16_t q1u8, q3u8; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - - assert(y_step_q4 == 16); - - src -= src_stride * 3; - q0s16 = vld1q_s16(filter_y); - for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h - s = src; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); - s += src_stride; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); - s += src_stride; - d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); - s += src_stride; - - q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); - q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); - q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); - q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); - - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d = dst; - for (height = h; height > 0; height -= 4) { // loop_vert - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); - s += src_stride; - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); - s += src_stride; - - q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); - q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); - - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); - d += dst_stride; - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); - d -= dst_stride * 3; - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - __builtin_prefetch(s); - __builtin_prefetch(s + src_stride); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, - d22s16, d24s16, q0s16); - __builtin_prefetch(s + src_stride * 2); - __builtin_prefetch(s + src_stride * 3); - q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, - d24s16, d26s16, q0s16); - __builtin_prefetch(d); - __builtin_prefetch(d + 
dst_stride); - q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16, - d26s16, d27s16, q0s16); - __builtin_prefetch(d + dst_stride * 2); - __builtin_prefetch(d + dst_stride * 3); - q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16, - d27s16, d25s16, q0s16); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u8 = vqmovn_u16(q1u16); - d3u8 = vqmovn_u16(q2u16); - - q1u8 = vcombine_u8(d2u8, d3u8); - q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); - - q1u8 = vrhaddq_u8(q1u8, q3u8); - - d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); - d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); - - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - d += dst_stride; - - q8u16 = q10u16; - d18s16 = d22s16; - d19s16 = d24s16; - q10u16 = q13u16; - d22s16 = d25s16; - } - } - return; -} diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm index e279d570fc..1c2ee50630 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm @@ -42,10 +42,11 @@ ; r1 int src_stride ; r2 uint8_t *dst ; r3 int dst_stride -; sp[]const int16_t *filter_x -; sp[]int x_step_q4 -; sp[]const int16_t *filter_y ; unused -; sp[]int y_step_q4 ; unused +; sp[]const int16_t *filter +; sp[]int x0_q4 +; sp[]int x_step_q4 ; unused +; sp[]int y0_q4 +; sp[]int y_step_q4 ; unused ; sp[]int w ; sp[]int h @@ -54,11 +55,11 @@ sub r0, r0, #3 ; adjust for taps - ldr r5, [sp, #32] ; filter_x - ldr r6, [sp, #48] ; w - ldr r7, [sp, #52] ; h + ldrd r4, r5, [sp, #32] ; filter, x0_q4 + add r4, r5, lsl #4 + ldrd r6, r7, [sp, #52] ; w, h - vld1.s16 {q0}, [r5] ; filter_x + vld1.s16 {q0}, [r4] ; filter sub r8, r1, r1, lsl #2 ; -src_stride * 3 add r8, r8, #4 ; -src_stride * 3 + 4 @@ -127,7 +128,7 @@ vpx_convolve8_avg_loop_horiz sub r2, r2, r3, lsl #2 ; reset for store - ; src[] * filter_x + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 @@ -184,11 +185,13 @@ vpx_convolve8_avg_loop_horiz sub r0, r0, r1 sub r0, r0, r1, lsl #1 - ldr r4, [sp, #32] ; filter_y - ldr r6, [sp, #40] ; w - ldr lr, [sp, #44] ; h + ldr r4, [sp, #24] ; filter + ldr r5, [sp, #36] ; y0_q4 + add r4, r5, lsl #4 + ldr r6, [sp, #44] ; w + ldr lr, [sp, #48] ; h - vld1.s16 {q0}, [r4] ; filter_y + vld1.s16 {q0}, [r4] ; filter lsl r1, r1, #1 lsl r3, r3, #1 @@ -232,7 +235,7 @@ vpx_convolve8_avg_loop_vert pld [r7] pld [r4] - ; src[] * filter_y + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 pld [r7, r1] diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c index 514525696b..08ae17dbab 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c @@ -14,309 +14,946 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" #include "vpx_ports/mem.h" -static INLINE int32x4_t 
MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1, - int16x4_t dsrc2, int16x4_t dsrc3, - int16x4_t dsrc4, int16x4_t dsrc5, - int16x4_t dsrc6, int16x4_t dsrc7, - int16x8_t q0s16) { - int32x4_t qdst; - int16x4_t d0s16, d1s16; +// Note: +// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src). +// 2. After refactoring the shared code in kernel loops with inline functions, +// the decoder speed dropped a lot when using gcc compiler. Therefore there is +// no refactoring for those parts by now. +// 3. For horizontal convolve, there is an alternative optimization that +// convolves a single row in each loop. For each row, 8 sample banks with 4 or 8 +// samples in each are read from memory: src, (src+1), (src+2), (src+3), +// (src+4), (src+5), (src+6), (src+7), or prepared by vector extract +// instructions. This optimization is much faster in speed unit test, but slowed +// down the whole decoder by 5%. - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - - qdst = vmull_lane_s16(dsrc0, d0s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); - qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); - return qdst; +static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t s4, const uint8x8_t s5, + const uint8x8_t s6, const uint8x8_t s7) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); + s += p; + vst1_u8(s, s4); + s += p; + vst1_u8(s, s5); + s += p; + vst1_u8(s, s6); + s += p; + vst1_u8(s, s7); } void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, int h) { - int width; - const uint8_t *s, *psrc; - uint8_t *d, *pdst; - uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; - uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32; - uint8x16_t q12u8, q13u8, q14u8, q15u8; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - uint16x8x2_t q0x2u16; - uint8x8x2_t d0x2u8, d1x2u8; - uint32x2x2_t d0x2u32; - uint16x4x2_t d0x2u16, d1x2u16; - uint32x4x2_t q0x2u32; + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16x8_t filters = vld1q_s16(filter[x0_q4]); + uint8x8_t t0, t1, t2, t3; + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); assert(x_step_q4 == 16); - q0s16 = vld1q_s16(filter_x); + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; - src -= 3; // adjust for taps - for (; h > 0; h -= 4, src += src_stride * 4, - dst += dst_stride * 4) { // loop_horiz_v - s = src; - d24u8 = vld1_u8(s); - s += src_stride; - d25u8 = vld1_u8(s); - s += src_stride; - d26u8 = vld1_u8(s); - s += src_stride; - d27u8 = vld1_u8(s); + src -= 3; - q12u8 = vcombine_u8(d24u8, d25u8); - q13u8 = vcombine_u8(d26u8, d27u8); + if (h == 4) { + uint8x8_t d01, d23; + int16x4_t filter3, filter4, s0, s1, s2, s3, s4, 
s5, s6, s7, s8, s9, s10, d0, + d1, d2, d3; + int16x8_t tt0, tt1, tt2, tt3; - q0x2u16 = - vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8)); - d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); - d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); - d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); - d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); - d0x2u8 = vtrn_u8(d24u8, d25u8); - d1x2u8 = vtrn_u8(d26u8, d27u8); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + filter3 = vdup_lane_s16(vget_low_s16(filters), 3); + filter4 = vdup_lane_s16(vget_high_s16(filters), 0); + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s0 = vget_low_s16(tt0); + s1 = vget_low_s16(tt1); + s2 = vget_low_s16(tt2); + s3 = vget_low_s16(tt3); + s4 = vget_high_s16(tt0); + s5 = vget_high_s16(tt1); + s6 = vget_high_s16(tt2); + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + src += 7; - __builtin_prefetch(src + src_stride * 4); - __builtin_prefetch(src + src_stride * 5); - __builtin_prefetch(src + src_stride * 6); + do { + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s7 = vget_low_s16(tt0); + s8 = vget_low_s16(tt1); + s9 = vget_low_s16(tt2); + s10 = vget_low_s16(tt3); - q8u16 = vmovl_u8(d0x2u8.val[0]); - q9u16 = vmovl_u8(d0x2u8.val[1]); - q10u16 = vmovl_u8(d1x2u8.val[0]); - q11u16 = vmovl_u8(d1x2u8.val[1]); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); - d16u16 = vget_low_u16(q8u16); - d17u16 = vget_high_u16(q8u16); - d18u16 = vget_low_u16(q9u16); - d19u16 = vget_high_u16(q9u16); - q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 - q9u16 = vcombine_u16(d17u16, d19u16); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); + transpose_u8_4x4(&d01, &d23); - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 - for (width = w, psrc = src + 7, pdst = dst; width > 0; - width -= 4, psrc += 4, pdst += 4) { // loop_horiz - s = psrc; - d28u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d29u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d31u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d30u32 = vld1_dup_u32((const uint32_t *)s); + vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), + vreinterpret_u32_u8(d01), 0); + vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), + vreinterpret_u32_u8(d23), 0); + vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), + vreinterpret_u32_u8(d01), 1); + vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), + 
vreinterpret_u32_u8(d23), 1); - __builtin_prefetch(psrc + 64); + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4; + dst += 4; + w -= 4; + } while (w > 0); + } else { + const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); + const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); + int width; + const uint8_t *s; + uint8x8_t t4, t5, t6, t7; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - d0x2u16 = - vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32)); - d1x2u16 = - vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32)); - d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 - vreinterpret_u8_u16(d1x2u16.val[0])); // d29 - d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 - vreinterpret_u8_u16(d1x2u16.val[1])); // d30 + if (w == 4) { + do { + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - __builtin_prefetch(psrc + 64 + src_stride); + load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); + src += 8 * src_stride; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); - q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); - q0x2u32 = - vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8)); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); - d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); - d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); - q12u16 = vmovl_u8(d28u8); - q13u16 = vmovl_u8(d29u8); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1); + dst += dst_stride; + 
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1); + dst += dst_stride; + h -= 8; + } while (h > 0); + } else { + uint8_t *d; + int16x8_t s11, s12, s13, s14; - __builtin_prefetch(psrc + 64 + src_stride * 2); + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + width = w; + s = src + 7; + d = dst; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16, - d23s16, d24s16, q0s16); - q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, - d24s16, d26s16, q0s16); - q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, - d26s16, d27s16, q0s16); - q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, - d27s16, d25s16, q0s16); + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - __builtin_prefetch(psrc + 60 + src_stride * 3); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); + t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, + filter4); + t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, + filter4); + t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, 
s13, filters, filter3, + filter4); + t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, + filter3, filter4); - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7); - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u8 = vqmovn_u16(q1u16); - d3u8 = vqmovn_u16(q2u16); - - d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8)); - d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), - vreinterpret_u32_u16(d0x2u16.val[1])); - d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), - vreinterpret_u8_u32(d0x2u32.val[1])); - - d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]); - d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]); - - d = pdst; - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - - q8u16 = q9u16; - d20s16 = d23s16; - q11u16 = q12u16; - q9u16 = q13u16; - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); + } + } +} + +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16x8_t filters = vld1q_s16(filter[x0_q4]); + uint8x8_t t0, t1, t2, t3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (h == 4) { + uint8x8_t d01, d23; + int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, + d1, d2, d3; + int16x8_t tt0, tt1, tt2, tt3; + uint32x4_t d0123 = vdupq_n_u32(0); + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + filter3 = vdup_lane_s16(vget_low_s16(filters), 3); + filter4 = vdup_lane_s16(vget_high_s16(filters), 0); + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s0 = vget_low_s16(tt0); + s1 = vget_low_s16(tt1); + s2 = vget_low_s16(tt2); + s3 = vget_low_s16(tt3); + s4 = vget_high_s16(tt0); + s5 = vget_high_s16(tt1); + s6 = vget_high_s16(tt2); + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + src += 7; + + do { + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s7 = vget_low_s16(tt0); + s8 = vget_low_s16(tt1); + s9 = vget_low_s16(tt2); + s10 = vget_low_s16(tt3); + + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + 
d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); + + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); + transpose_u8_4x4(&d01, &d23); + + d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); + d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2); + d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1); + d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); + d0123 = vreinterpretq_u32_u8( + vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23))); + + vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); + vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2); + vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1); + vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4; + dst += 4; + w -= 4; + } while (w > 0); + } else { + const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); + const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); + int width; + const uint8_t *s; + uint8x8_t t4, t5, t6, t7; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + + if (w == 4) { + uint32x4_t d0415 = vdupq_n_u32(0); + uint32x4_t d2637 = vdupq_n_u32(0); + do { + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); + src += 8 * src_stride; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); + + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0); + d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2); + d2637 = vld1q_lane_u32((uint32_t 
*)(dst + 2 * dst_stride), d2637, 0); + d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2); + d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1); + d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3); + d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1); + d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3); + d0415 = vreinterpretq_u32_u8( + vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1))); + d2637 = vreinterpretq_u32_u8( + vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3))); + + vst1q_lane_u32((uint32_t *)dst, d0415, 0); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d0415, 2); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d2637, 0); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d2637, 2); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d0415, 1); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d0415, 3); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d2637, 1); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d2637, 3); + dst += dst_stride; + h -= 8; + } while (h > 0); + } else { + uint8_t *d; + int16x8_t s11, s12, s13, s14; + uint8x16_t d01, d23, d45, d67; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + width = w; + s = src + 7; + d = dst; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); + t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, + filter4); + t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, + filter4); + t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, + filter4); + t7 = convolve8_8(s7, s8, s9, s10, s11, s12, 
s13, s14, filters, + filter3, filter4); + + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride), + vld1_u8(d + 1 * dst_stride)); + d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride), + vld1_u8(d + 3 * dst_stride)); + d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride), + vld1_u8(d + 5 * dst_stride)); + d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride), + vld1_u8(d + 7 * dst_stride)); + d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1)); + d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3)); + d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5)); + d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7)); + + store_u8_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01), + vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45), + vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67)); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); } } - return; } void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - int height; - const uint8_t *s; - uint8_t *d; - uint32x2_t d2u32, d3u32; - uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; + const int16x8_t filters = vld1q_s16(filter[y0_q4]); + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); assert(y_step_q4 == 16); - src -= src_stride * 3; - q0s16 = vld1q_s16(filter_y); - for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h - s = src; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); - s += src_stride; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); - s += src_stride; - d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); - s += src_stride; + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; - q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); - q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); - q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); - q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); + src -= 3 * src_stride; - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d = dst; - for (height = h; height > 0; height -= 4) { // loop_vert - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); + if (w == 4) { + const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); + const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); + uint8x8_t d01, d23; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s1 = 
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + + do { + s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); + + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); + dst += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + h -= 4; + } while (h > 0); + } else { + const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); + const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); + int height; + const uint8_t *s; + uint8_t *d; + uint8x8_t t0, t1, t2, t3; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + s = src; + s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); + s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); + s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); s += src_stride; - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); + s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); s += src_stride; + s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + d = dst; + height = h; - q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); - q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); + do { + s7 = 
vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); - __builtin_prefetch(d); - __builtin_prefetch(d + dst_stride); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, - d22s16, d24s16, q0s16); - __builtin_prefetch(d + dst_stride * 2); - __builtin_prefetch(d + dst_stride * 3); - q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, - d24s16, d26s16, q0s16); - __builtin_prefetch(s); - __builtin_prefetch(s + src_stride); - q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16, - d26s16, d27s16, q0s16); - __builtin_prefetch(s + src_stride * 2); - __builtin_prefetch(s + src_stride * 3); - q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16, - d27s16, d25s16, q0s16); + vst1_u8(d, t0); + d += dst_stride; + vst1_u8(d, t1); + d += dst_stride; + vst1_u8(d, t2); + d += dst_stride; + vst1_u8(d, t3); + d += dst_stride; - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16)); - d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16)); - - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - d += dst_stride; - - q8u16 = q10u16; - d18s16 = d22s16; - d19s16 = d24s16; - q10u16 = q13u16; - d22s16 = d25s16; - } + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } +} + +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16x8_t filters = vld1q_s16(filter[y0_q4]); + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const int16x4_t 
filter3 = vdup_lane_s16(vget_low_s16(filters), 3); + const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); + uint8x8_t d01, d23; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + uint32x4_t d0123 = vdupq_n_u32(0); + + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + + do { + s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); + + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); + + d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); + d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1); + d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2); + d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); + d0123 = vreinterpretq_u32_u8( + vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23))); + + vst1q_lane_u32((uint32_t *)dst, d0123, 0); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d0123, 1); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d0123, 2); + dst += dst_stride; + vst1q_lane_u32((uint32_t *)dst, d0123, 3); + dst += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + h -= 4; + } while (h > 0); + } else { + const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); + const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); + int height; + const uint8_t *s; + uint8_t *d; + uint8x8_t t0, t1, t2, t3; + uint8x16_t d01, d23, dd01, dd23; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + s = src; + s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += 
src_stride; + s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + d = dst; + height = h; + + do { + s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + filter4); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + filter4); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + filter4); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + filter4); + + d01 = vcombine_u8(t0, t1); + d23 = vcombine_u8(t2, t3); + dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride), + vld1_u8(d + 1 * dst_stride)); + dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride), + vld1_u8(d + 3 * dst_stride)); + dd01 = vrhaddq_u8(dd01, d01); + dd23 = vrhaddq_u8(dd23, d23); + + vst1_u8(d, vget_low_u8(dd01)); + d += dst_stride; + vst1_u8(d, vget_high_u8(dd01)); + d += dst_stride; + vst1_u8(d, vget_low_u8(dd23)); + d += dst_stride; + vst1_u8(d, vget_high_u8(dd23)); + d += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); } - return; } diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h new file mode 100644 index 0000000000..c1634ed55f --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); +} + +static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6, uint8x8_t *const s7) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); + s += p; + *s7 = vld1_u8(s); +} + +static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3, + uint8x16_t *const s4, uint8x16_t *const s5, + uint8x16_t *const s6, uint8x16_t *const s7) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); + s += p; + *s4 = vld1q_u8(s); + s += p; + *s5 = vld1q_u8(s); + s += p; + *s6 = vld1q_u8(s); + s += p; + *s7 = vld1q_u8(s); +} + +static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filters, + const int16x4_t filter3, + const int16x4_t filter4) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int16x4_t sum; + + sum = vmul_lane_s16(s0, filters_lo, 0); + sum = vmla_lane_s16(sum, s1, filters_lo, 1); + sum = vmla_lane_s16(sum, s2, filters_lo, 2); + sum = vmla_lane_s16(sum, s5, filters_hi, 1); + sum = vmla_lane_s16(sum, s6, filters_hi, 2); + sum = vmla_lane_s16(sum, s7, filters_hi, 3); + sum = vqadd_s16(sum, vmul_s16(s3, filter3)); + sum = vqadd_s16(sum, vmul_s16(s4, filter4)); + return sum; +} + +static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filters, + const int16x8_t filter3, + const int16x8_t filter4) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int16x8_t sum; + + sum = vmulq_lane_s16(s0, filters_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filters_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filters_lo, 2); + sum = vmlaq_lane_s16(sum, s5, filters_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filters_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filters_hi, 3); + sum = vqaddq_s16(sum, vmulq_s16(s3, filter3)); + sum = vqaddq_s16(sum, vmulq_s16(s4, filter4)); + return vqrshrun_n_s16(sum, 7); +} + +static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, + const int16x8_t filters) { + const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); + const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); + int16x8_t ss[8]; + + ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); + ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); + ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); + ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); + ss[4] = vreinterpretq_s16_u16(vmovl_u8(s[4])); + ss[5] = vreinterpretq_s16_u16(vmovl_u8(s[5])); + ss[6] = 
vreinterpretq_s16_u16(vmovl_u8(s[6])); + ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7])); + + return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7], + filters, filter3, filter4); +} diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm index 2d0f2ae065..5eee15664d 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm @@ -42,10 +42,11 @@ ; r1 int src_stride ; r2 uint8_t *dst ; r3 int dst_stride -; sp[]const int16_t *filter_x -; sp[]int x_step_q4 -; sp[]const int16_t *filter_y ; unused -; sp[]int y_step_q4 ; unused +; sp[]const int16_t *filter +; sp[]int x0_q4 +; sp[]int x_step_q4 ; unused +; sp[]int y0_q4 +; sp[]int y_step_q4 ; unused ; sp[]int w ; sp[]int h @@ -54,11 +55,11 @@ sub r0, r0, #3 ; adjust for taps - ldr r5, [sp, #32] ; filter_x - ldr r6, [sp, #48] ; w - ldr r7, [sp, #52] ; h + ldrd r4, r5, [sp, #32] ; filter, x0_q4 + add r4, r5, lsl #4 + ldrd r6, r7, [sp, #52] ; w, h - vld1.s16 {q0}, [r5] ; filter_x + vld1.s16 {q0}, [r4] ; filter sub r8, r1, r1, lsl #2 ; -src_stride * 3 add r8, r8, #4 ; -src_stride * 3 + 4 @@ -119,7 +120,7 @@ vpx_convolve8_loop_horiz pld [r5, r1, lsl #1] - ; src[] * filter_x + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 @@ -173,11 +174,13 @@ vpx_convolve8_loop_horiz sub r0, r0, r1 sub r0, r0, r1, lsl #1 - ldr r4, [sp, #32] ; filter_y - ldr r6, [sp, #40] ; w - ldr lr, [sp, #44] ; h + ldr r4, [sp, #24] ; filter + ldr r5, [sp, #36] ; y0_q4 + add r4, r5, lsl #4 + ldr r6, [sp, #44] ; w + ldr lr, [sp, #48] ; h - vld1.s16 {q0}, [r4] ; filter_y + vld1.s16 {q0}, [r4] ; filter lsl r1, r1, #1 lsl r3, r3, #1 @@ -216,7 +219,7 @@ vpx_convolve8_loop_vert pld [r5] pld [r8] - ; src[] * filter_y + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 pld [r5, r3] diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c index abc2511ea2..07349d03ae 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c @@ -13,132 +13,127 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_convolve_avg_neon(const uint8_t *src, // r0 - ptrdiff_t src_stride, // r1 - uint8_t *dst, // r2 - ptrdiff_t dst_stride, // r3 - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, - int h) { - uint8_t *d; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint32x2_t d0u32, d2u32; - uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; +void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; - d = dst; - if (w > 32) { // avg64 - for (; h > 0; h -= 1) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - q2u8 = vld1q_u8(src + 32); - q3u8 = vld1q_u8(src + 48); + if (w < 8) { // avg4 + uint8x8_t s0, s1; + uint8x8_t dd0 = vdup_n_u8(0); + uint32x2x2_t s01; + do { + s0 = vld1_u8(src); src += src_stride; - q8u8 = vld1q_u8(d); - q9u8 = vld1q_u8(d + 16); - q10u8 = vld1q_u8(d + 32); - q11u8 = vld1q_u8(d + 48); - d += 
dst_stride; - - q0u8 = vrhaddq_u8(q0u8, q8u8); - q1u8 = vrhaddq_u8(q1u8, q9u8); - q2u8 = vrhaddq_u8(q2u8, q10u8); - q3u8 = vrhaddq_u8(q3u8, q11u8); - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - vst1q_u8(dst + 32, q2u8); - vst1q_u8(dst + 48, q3u8); - dst += dst_stride; - } - } else if (w == 32) { // avg32 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); + s1 = vld1_u8(src); src += src_stride; - q2u8 = vld1q_u8(src); - q3u8 = vld1q_u8(src + 16); - src += src_stride; - q8u8 = vld1q_u8(d); - q9u8 = vld1q_u8(d + 16); - d += dst_stride; - q10u8 = vld1q_u8(d); - q11u8 = vld1q_u8(d + 16); - d += dst_stride; - - q0u8 = vrhaddq_u8(q0u8, q8u8); - q1u8 = vrhaddq_u8(q1u8, q9u8); - q2u8 = vrhaddq_u8(q2u8, q10u8); - q3u8 = vrhaddq_u8(q3u8, q11u8); - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); + s01 = vzip_u32(vreinterpret_u32_u8(s0), vreinterpret_u32_u8(s1)); + dd0 = vreinterpret_u8_u32( + vld1_lane_u32((const uint32_t *)dst, vreinterpret_u32_u8(dd0), 0)); + dd0 = vreinterpret_u8_u32(vld1_lane_u32( + (const uint32_t *)(dst + dst_stride), vreinterpret_u32_u8(dd0), 1)); + dd0 = vrhadd_u8(vreinterpret_u8_u32(s01.val[0]), dd0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dd0), 0); dst += dst_stride; - vst1q_u8(dst, q2u8); - vst1q_u8(dst + 16, q3u8); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dd0), 1); dst += dst_stride; - } - } else if (w > 8) { // avg16 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - src += src_stride; - q1u8 = vld1q_u8(src); - src += src_stride; - q2u8 = vld1q_u8(d); - d += dst_stride; - q3u8 = vld1q_u8(d); - d += dst_stride; - - q0u8 = vrhaddq_u8(q0u8, q2u8); - q1u8 = vrhaddq_u8(q1u8, q3u8); - - vst1q_u8(dst, q0u8); - dst += dst_stride; - vst1q_u8(dst, q1u8); - dst += dst_stride; - } + h -= 2; + } while (h > 0); } else if (w == 8) { // avg8 - for (; h > 0; h -= 2) { - d0u8 = vld1_u8(src); + uint8x8_t s0, s1, d0, d1; + uint8x16_t s01, d01; + do { + s0 = vld1_u8(src); src += src_stride; - d1u8 = vld1_u8(src); + s1 = vld1_u8(src); src += src_stride; - d2u8 = vld1_u8(d); - d += dst_stride; - d3u8 = vld1_u8(d); - d += dst_stride; + d0 = vld1_u8(dst); + d1 = vld1_u8(dst + dst_stride); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - q0u8 = vrhaddq_u8(q0u8, q1u8); + s01 = vcombine_u8(s0, s1); + d01 = vcombine_u8(d0, d1); + d01 = vrhaddq_u8(s01, d01); - vst1_u8(dst, vget_low_u8(q0u8)); + vst1_u8(dst, vget_low_u8(d01)); dst += dst_stride; - vst1_u8(dst, vget_high_u8(q0u8)); + vst1_u8(dst, vget_high_u8(d01)); dst += dst_stride; - } - } else { // avg4 - for (; h > 0; h -= 2) { - d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0); + h -= 2; + } while (h > 0); + } else if (w < 32) { // avg16 + uint8x16_t s0, s1, d0, d1; + do { + s0 = vld1q_u8(src); src += src_stride; - d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1); + s1 = vld1q_u8(src); src += src_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0); - d += dst_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1); - d += dst_stride; + d0 = vld1q_u8(dst); + d1 = vld1q_u8(dst + dst_stride); - d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), vreinterpret_u8_u32(d2u32)); + d0 = vrhaddq_u8(s0, d0); + d1 = vrhaddq_u8(s1, d1); - d0u32 = vreinterpret_u32_u8(d0u8); - vst1_lane_u32((uint32_t *)dst, d0u32, 0); + vst1q_u8(dst, d0); dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, d0u32, 1); + vst1q_u8(dst, d1); dst += dst_stride; - } + h -= 2; + } while (h > 0); + } else if (w == 32) { // avg32 + uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3; + do 
{ + s0 = vld1q_u8(src); + s1 = vld1q_u8(src + 16); + src += src_stride; + s2 = vld1q_u8(src); + s3 = vld1q_u8(src + 16); + src += src_stride; + d0 = vld1q_u8(dst); + d1 = vld1q_u8(dst + 16); + d2 = vld1q_u8(dst + dst_stride); + d3 = vld1q_u8(dst + dst_stride + 16); + + d0 = vrhaddq_u8(s0, d0); + d1 = vrhaddq_u8(s1, d1); + d2 = vrhaddq_u8(s2, d2); + d3 = vrhaddq_u8(s3, d3); + + vst1q_u8(dst, d0); + vst1q_u8(dst + 16, d1); + dst += dst_stride; + vst1q_u8(dst, d2); + vst1q_u8(dst + 16, d3); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // avg64 + uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3; + do { + s0 = vld1q_u8(src); + s1 = vld1q_u8(src + 16); + s2 = vld1q_u8(src + 32); + s3 = vld1q_u8(src + 48); + src += src_stride; + d0 = vld1q_u8(dst); + d1 = vld1q_u8(dst + 16); + d2 = vld1q_u8(dst + 32); + d3 = vld1q_u8(dst + 48); + + d0 = vrhaddq_u8(s0, d0); + d1 = vrhaddq_u8(s1, d1); + d2 = vrhaddq_u8(s2, d2); + d3 = vrhaddq_u8(s3, d3); + + vst1q_u8(dst, d0); + vst1q_u8(dst + 16, d1); + vst1q_u8(dst + 32, d2); + vst1q_u8(dst + 48, d3); + dst += dst_stride; + } while (--h); } - return; } diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm index 97e6189fda..efd6574f1f 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm @@ -17,7 +17,7 @@ |vpx_convolve_avg_neon| PROC push {r4-r6, lr} - ldrd r4, r5, [sp, #32] + ldrd r4, r5, [sp, #36] mov r6, r2 cmp r4, #32 diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c index fec189e0e4..7abed67a40 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c @@ -13,80 +13,87 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_convolve_copy_neon(const uint8_t *src, // r0 - ptrdiff_t src_stride, // r1 - uint8_t *dst, // r2 - ptrdiff_t dst_stride, // r3 - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, +void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - uint8x8_t d0u8, d2u8; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; - if (w > 32) { // copy64 - for (; h > 0; h--) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - q2u8 = vld1q_u8(src + 32); - q3u8 = vld1q_u8(src + 48); - src += src_stride; - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - vst1q_u8(dst + 32, q2u8); - vst1q_u8(dst + 48, q3u8); - dst += dst_stride; - } - } else if (w == 32) { // copy32 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - src += src_stride; - q2u8 = vld1q_u8(src); - q3u8 = vld1q_u8(src + 16); - src += src_stride; - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - dst += dst_stride; - vst1q_u8(dst, q2u8); - vst1q_u8(dst + 16, q3u8); - dst += dst_stride; - } - } else if (w > 8) { // copy16 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - src += src_stride; - q1u8 = vld1q_u8(src); - src += src_stride; - - vst1q_u8(dst, q0u8); - dst += dst_stride; - vst1q_u8(dst, q1u8); - dst += dst_stride; - } - } else if (w == 8) { // copy8 - for (; h > 0; h -= 2) { - d0u8 = vld1_u8(src); - src += src_stride; 
- d2u8 = vld1_u8(src); - src += src_stride; - - vst1_u8(dst, d0u8); - dst += dst_stride; - vst1_u8(dst, d2u8); - dst += dst_stride; - } - } else { // copy4 - for (; h > 0; h--) { + if (w < 8) { // copy4 + do { *(uint32_t *)dst = *(const uint32_t *)src; src += src_stride; dst += dst_stride; - } + *(uint32_t *)dst = *(const uint32_t *)src; + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w == 8) { // copy8 + uint8x8_t s0, s1; + do { + s0 = vld1_u8(src); + src += src_stride; + s1 = vld1_u8(src); + src += src_stride; + + vst1_u8(dst, s0); + dst += dst_stride; + vst1_u8(dst, s1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w < 32) { // copy16 + uint8x16_t s0, s1; + do { + s0 = vld1q_u8(src); + src += src_stride; + s1 = vld1q_u8(src); + src += src_stride; + + vst1q_u8(dst, s0); + dst += dst_stride; + vst1q_u8(dst, s1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w == 32) { // copy32 + uint8x16_t s0, s1, s2, s3; + do { + s0 = vld1q_u8(src); + s1 = vld1q_u8(src + 16); + src += src_stride; + s2 = vld1q_u8(src); + s3 = vld1q_u8(src + 16); + src += src_stride; + + vst1q_u8(dst, s0); + vst1q_u8(dst + 16, s1); + dst += dst_stride; + vst1q_u8(dst, s2); + vst1q_u8(dst + 16, s3); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // copy64 + uint8x16_t s0, s1, s2, s3; + do { + s0 = vld1q_u8(src); + s1 = vld1q_u8(src + 16); + s2 = vld1q_u8(src + 32); + s3 = vld1q_u8(src + 48); + src += src_stride; + + vst1q_u8(dst, s0); + vst1q_u8(dst + 16, s1); + vst1q_u8(dst + 32, s2); + vst1q_u8(dst + 48, s3); + dst += dst_stride; + } while (--h); } - return; } diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm index 89164ad48b..7a66e3ce2f 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm @@ -17,7 +17,7 @@ |vpx_convolve_copy_neon| PROC push {r4-r5, lr} - ldrd r4, r5, [sp, #28] + ldrd r4, r5, [sp, #32] cmp r4, #32 bgt copy64 diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c index c2d5895b71..2bf2d890be 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c @@ -15,41 +15,40 @@ #include "vpx_ports/mem.h" void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). */ - DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); + uint8_t temp[64 * 72]; // Account for the vertical phase needing 3 lines prior and 4 lines post - int intermediate_height = h + 7; + const int intermediate_height = h + 7; assert(y_step_q4 == 16); assert(x_step_q4 == 16); - /* Filter starting 3 lines back. The neon implementation will ignore the - * given height and filter a multiple of 4 lines. Since this goes in to - * the temp buffer which has lots of extra room and is subsequently discarded - * this is safe if somewhat less than ideal. - */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x, - x_step_q4, filter_y, y_step_q4, w, + /* Filter starting 3 lines back. 
The neon implementation will ignore the given + * height and filter a multiple of 4 lines. Since this goes in to the temp + * buffer which has lots of extra room and is subsequently discarded this is + * safe if somewhat less than ideal. */ + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, intermediate_height); /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); - int intermediate_height = h + 7; + uint8_t temp[64 * 72]; + const int intermediate_height = h + 7; assert(y_step_q4 == 16); assert(x_step_q4 == 16); @@ -57,9 +56,9 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, /* This implementation has the same issues as above. In addition, we only want * to average the values after both passes. */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x, - x_step_q4, filter_y, y_step_q4, w, + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, intermediate_height); - vpx_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } diff --git a/libs/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c new file mode 100644 index 0000000000..8edf8a66e6 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_ports/mem.h" + +static INLINE void scaledconvolve_horiz_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int x, y, z; + + src -= SUBPEL_TAPS / 2 - 1; + + y = h; + do { + int x_q4 = x0_q4; + x = 0; + do { + // process 4 src_x steps + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + if (x_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); + const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); + const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); + uint8x8_t s[8], d; + int16x8_t ss[4]; + int16x4_t t[8], tt; + + load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]); + transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]); + + ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); + ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); + ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); + ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); + t[0] = vget_low_s16(ss[0]); + t[1] = vget_low_s16(ss[1]); + t[2] = vget_low_s16(ss[2]); + t[3] = vget_low_s16(ss[3]); + t[4] = vget_high_s16(ss[0]); + t[5] = vget_high_s16(ss[1]); + t[6] = vget_high_s16(ss[2]); + t[7] = vget_high_s16(ss[3]); + + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], + filters, filter3, filter4); + d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); + vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0); + } else { + int i; + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 4x4 filters values back to dst + { + const uint8x8x4_t d4 = vld4_u8(temp); + vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride], + vreinterpret_u32_u8(d4.val[0]), 0); + vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride], + vreinterpret_u32_u8(d4.val[1]), 0); + vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride], + vreinterpret_u32_u8(d4.val[2]), 0); + vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride], + vreinterpret_u32_u8(d4.val[3]), 0); + } + x += 4; + } while (x < w); + + src += src_stride * 4; + dst += dst_stride * 4; + y -= 4; + } while (y > 0); +} + +static INLINE void scaledconvolve_horiz_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int x, y, z; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. 
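The forcing described in this comment is done with the usual power-of-two round-up on the line that follows: (h + 7) & ~7 rounds h up to the next multiple of 8, so for example an intermediate height of 61 becomes (61 + 7) & ~7 = 64, while a value that is already a multiple of 8, such as 64, is left unchanged.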
+ y = (h + 7) & ~7; + + do { + int x_q4 = x0_q4; + x = 0; + do { + uint8x8_t d[8]; + // process 8 src_x steps + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + + if (x_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); + uint8x8_t s[8]; + load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], + &s[5], &s[6], &s[7]); + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + d[0] = scale_filter_8(s, filters); + vst1_u8(&temp[8 * z], d[0]); + } else { + int i; + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 8x8 filters values back to dst + load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], + &d[7]); + transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + vst1_u8(&dst[x + 0 * dst_stride], d[0]); + vst1_u8(&dst[x + 1 * dst_stride], d[1]); + vst1_u8(&dst[x + 2 * dst_stride], d[2]); + vst1_u8(&dst[x + 3 * dst_stride], d[3]); + vst1_u8(&dst[x + 4 * dst_stride], d[4]); + vst1_u8(&dst[x + 5 * dst_stride], d[5]); + vst1_u8(&dst[x + 6 * dst_stride], d[6]); + vst1_u8(&dst[x + 7 * dst_stride], d[7]); + x += 8; + } while (x < w); + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static INLINE void scaledconvolve_vert_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); + const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); + uint8x8_t s[8], d; + int16x4_t t[8], tt; + + load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0]))); + t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1]))); + t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2]))); + t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3]))); + t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4]))); + t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5]))); + t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6]))); + t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7]))); + + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters, + filter3, filter4); + d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +static INLINE void scaledconvolve_vert_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x8_t s[8], d; + load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + 
&s[6], &s[7]); + d = scale_filter_8(s, filters); + vst1_u8(dst, d); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +static INLINE void scaledconvolve_vert_w16( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int x, y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + if (y_q4 & SUBPEL_MASK) { + x = 0; + do { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x16_t ss[8]; + uint8x8_t s[8], d[2]; + load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4], + &ss[5], &ss[6], &ss[7]); + s[0] = vget_low_u8(ss[0]); + s[1] = vget_low_u8(ss[1]); + s[2] = vget_low_u8(ss[2]); + s[3] = vget_low_u8(ss[3]); + s[4] = vget_low_u8(ss[4]); + s[5] = vget_low_u8(ss[5]); + s[6] = vget_low_u8(ss[6]); + s[7] = vget_low_u8(ss[7]); + d[0] = scale_filter_8(s, filters); + + s[0] = vget_high_u8(ss[0]); + s[1] = vget_high_u8(ss[1]); + s[2] = vget_high_u8(ss[2]); + s[3] = vget_high_u8(ss[3]); + s[4] = vget_high_u8(ss[4]); + s[5] = vget_high_u8(ss[5]); + s[6] = vget_high_u8(ss[6]); + s[7] = vget_high_u8(ss[7]); + d[1] = scale_filter_8(s, filters); + vst1q_u8(&dst[x], vcombine_u8(d[0], d[1])); + src_y += 16; + x += 16; + } while (x < w); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. 
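As a quick numeric check of the bound derived in the comment above: with libvpx's SUBPEL_BITS of 4 and SUBPEL_TAPS of 8, the worst case the comment describes works out to 134 rows, which fits inside the (135 + 8)-row buffer. A minimal sketch using the same formula as the code below (check_temp_rows is a hypothetical helper, not part of the patch):

```c
#include <assert.h>

#define SUBPEL_BITS 4 /* libvpx sub-pixel precision  */
#define SUBPEL_TAPS 8 /* 8-tap interpolation filters */

/* Worst case from the derivation above: 64 output rows, the smallest
 * normative scaling factor (y_step_q4 == 32) and the last sub-pixel
 * start phase (y0_q4 == 15). */
static void check_temp_rows(void) {
  const int h = 64, y_step_q4 = 32, y0_q4 = 15;
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
  assert(intermediate_height <= 135 + 8); /* 134: fits the temp buffer */
}
```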
+ DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if (w >= 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else { + scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } +} diff --git a/libs/libvpx/vpx_dsp/avg.c b/libs/libvpx/vpx_dsp/avg.c index 4d9abb8de3..a7ac6d9538 100644 --- a/libs/libvpx/vpx_dsp/avg.c +++ b/libs/libvpx/vpx_dsp/avg.c @@ -34,7 +34,7 @@ unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) { // src_diff: first pass, 9 bit, dynamic range [-255, 255] // second pass, 12 bit, dynamic range [-2040, 2040] -static void hadamard_col8(const int16_t *src_diff, int src_stride, +static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; @@ -66,10 +66,11 @@ static void hadamard_col8(const int16_t *src_diff, int src_stride, // The order of the output coeff of the hadamard is not important. For // optimization purposes the final transpose may be skipped. 
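The bit-depth annotations threaded through the hadamard hunks below (9-bit source differences, 12 bits after the first pass, 15 bits after the second) follow from the transform itself: an 8-point Hadamard is three add/subtract stages, so each 1-D pass can grow the worst-case magnitude by at most a factor of 8. A small arithmetic check (hadamard_range_check is illustrative only, not a libvpx function):

```c
#include <assert.h>

/* Residuals start in [-255, 255]; each 8-point (three-stage) Hadamard
 * pass multiplies the worst-case magnitude by 8. */
static void hadamard_range_check(void) {
  const int max_residual = 255;            /* 9 bit, signed */
  const int after_rows = 8 * max_residual; /* 2040,  12 bit */
  const int after_cols = 8 * after_rows;   /* 16320, 15 bit */
  assert(after_rows == 2040 && after_cols == 16320);
}
```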
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, - int16_t *coeff) { +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { int idx; int16_t buffer[64]; + int16_t buffer2[64]; int16_t *tmp_buf = &buffer[0]; for (idx = 0; idx < 8; ++idx) { hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit @@ -80,17 +81,19 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, tmp_buf = &buffer[0]; for (idx = 0; idx < 8; ++idx) { - hadamard_col8(tmp_buf, 8, coeff); // tmp_buf: 12 bit - // dynamic range [-2040, 2040] - coeff += 8; // coeff: 15 bit - // dynamic range [-16320, 16320] + hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit + // dynamic range [-2040, 2040] + // buffer2: 15 bit + // dynamic range [-16320, 16320] ++tmp_buf; } + + for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx]; } // In place 16x16 2D Hadamard transform -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, - int16_t *coeff) { +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { int idx; for (idx = 0; idx < 4; ++idx) { // src_diff: 9 bit, dynamic range [-255, 255] @@ -101,15 +104,15 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, // coeff: 15 bit, dynamic range [-16320, 16320] for (idx = 0; idx < 64; ++idx) { - int16_t a0 = coeff[0]; - int16_t a1 = coeff[64]; - int16_t a2 = coeff[128]; - int16_t a3 = coeff[192]; + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[64]; + tran_low_t a2 = coeff[128]; + tran_low_t a3 = coeff[192]; - int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] - int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range - int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320] - int16_t b3 = (a2 - a3) >> 1; + tran_low_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] + tran_low_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range + tran_low_t b2 = (a2 + a3) >> 1; // [-16320, 16320] + tran_low_t b3 = (a2 - a3) >> 1; coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] coeff[64] = b1 + b3; @@ -122,7 +125,7 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. 
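Those two ranges also explain why the plain int accumulator in vpx_satd_c below is safe: even in the largest case, 1024 coefficients of magnitude 32640 sum to 1024 * 32640 = 33,423,360, which is well within the range of a 32-bit int, so no wider accumulator is needed.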
-int vpx_satd_c(const int16_t *coeff, int length) { +int vpx_satd_c(const tran_low_t *coeff, int length) { int i; int satd = 0; for (i = 0; i < length; ++i) satd += abs(coeff[i]); diff --git a/libs/libvpx/vpx_dsp/bitreader_buffer.c b/libs/libvpx/vpx_dsp/bitreader_buffer.c index e99fffb605..3e16bfa38c 100644 --- a/libs/libvpx/vpx_dsp/bitreader_buffer.c +++ b/libs/libvpx/vpx_dsp/bitreader_buffer.c @@ -40,11 +40,5 @@ int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits) { } int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits) { -#if CONFIG_MISC_FIXES - const int nbits = sizeof(unsigned) * 8 - bits - 1; - const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits; - return ((int)value) >> nbits; -#else return vpx_rb_read_signed_literal(rb, bits); -#endif } diff --git a/libs/libvpx/vpx_dsp/bitwriter_buffer.c b/libs/libvpx/vpx_dsp/bitwriter_buffer.c index 1043cdc784..7a7e96f02e 100644 --- a/libs/libvpx/vpx_dsp/bitwriter_buffer.c +++ b/libs/libvpx/vpx_dsp/bitwriter_buffer.c @@ -38,10 +38,6 @@ void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits) { void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data, int bits) { -#if CONFIG_MISC_FIXES - vpx_wb_write_literal(wb, data, bits + 1); -#else vpx_wb_write_literal(wb, abs(data), bits); vpx_wb_write_bit(wb, data < 0); -#endif } diff --git a/libs/libvpx/vpx_dsp/deblock.c b/libs/libvpx/vpx_dsp/deblock.c index 589b124e26..94acbb3919 100644 --- a/libs/libvpx/vpx_dsp/deblock.c +++ b/libs/libvpx/vpx_dsp/deblock.c @@ -7,7 +7,9 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include #include +#include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" const int16_t vpx_rv[] = { @@ -48,6 +50,9 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, unsigned char v; unsigned char d[4]; + assert(size >= 8); + assert(cols >= 8); + for (row = 0; row < size; row++) { /* post_proc_down for one row */ p_src = src_ptr; @@ -117,7 +122,7 @@ void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, unsigned char d[16]; for (r = 0; r < rows; r++) { - int sumsq = 0; + int sumsq = 16; int sum = 0; for (i = -8; i < 0; i++) s[i] = s[0]; @@ -156,14 +161,12 @@ void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit) { int r, c, i; - const int16_t *rv3 = &vpx_rv[63 & rand()]; for (c = 0; c < cols; c++) { unsigned char *s = &dst[c]; int sumsq = 0; int sum = 0; unsigned char d[16]; - const int16_t *rv2 = rv3 + ((c * 17) & 127); for (i = -8; i < 0; i++) s[i * pitch] = s[0]; @@ -183,7 +186,7 @@ void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, d[r & 15] = s[0]; if (sumsq * 15 - sum * sum < flimit) { - d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4; + d[r & 15] = (vpx_rv[(r & 127) + (c & 7)] + sum + s[0]) >> 4; } if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15]; s += pitch; diff --git a/libs/libvpx/vpx_dsp/fastssim.c b/libs/libvpx/vpx_dsp/fastssim.c index 4d5eb5a6ff..0469071a17 100644 --- a/libs/libvpx/vpx_dsp/fastssim.c +++ b/libs/libvpx/vpx_dsp/fastssim.c @@ -202,6 +202,7 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { if (bit_depth == 12) ssim_c1 = SSIM_C1_12; #else assert(bit_depth == 8); + (void)bit_depth; #endif w = _ctx->level[_l].w; h = _ctx->level[_l].h; @@ -326,6 +327,7 @@ static void fs_calc_structure(fs_ctx 
*_ctx, int _l, int bit_depth) { if (bit_depth == 12) ssim_c2 = SSIM_C2_12; #else assert(bit_depth == 8); + (void)bit_depth; #endif w = _ctx->level[_l].w; diff --git a/libs/libvpx/vpx_dsp/fwd_txfm.c b/libs/libvpx/vpx_dsp/fwd_txfm.c index 4e7d4053ea..6dcb3ba668 100644 --- a/libs/libvpx/vpx_dsp/fwd_txfm.c +++ b/libs/libvpx/vpx_dsp/fwd_txfm.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/fwd_txfm.h" @@ -21,36 +22,37 @@ void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { int pass; // We need an intermediate buffer between passes. tran_low_t intermediate[4 * 4]; - const int16_t *in_pass0 = input; - const tran_low_t *in = NULL; + const tran_low_t *in_low = NULL; tran_low_t *out = intermediate; // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { - tran_high_t input[4]; // canbe16 + tran_high_t in_high[4]; // canbe16 tran_high_t step[4]; // canbe16 tran_high_t temp1, temp2; // needs32 int i; for (i = 0; i < 4; ++i) { // Load inputs. - if (0 == pass) { - input[0] = in_pass0[0 * stride] * 16; - input[1] = in_pass0[1 * stride] * 16; - input[2] = in_pass0[2 * stride] * 16; - input[3] = in_pass0[3 * stride] * 16; - if (i == 0 && input[0]) { - input[0] += 1; + if (pass == 0) { + in_high[0] = input[0 * stride] * 16; + in_high[1] = input[1 * stride] * 16; + in_high[2] = input[2 * stride] * 16; + in_high[3] = input[3 * stride] * 16; + if (i == 0 && in_high[0]) { + ++in_high[0]; } } else { - input[0] = in[0 * 4]; - input[1] = in[1 * 4]; - input[2] = in[2 * 4]; - input[3] = in[3 * 4]; + assert(in_low != NULL); + in_high[0] = in_low[0 * 4]; + in_high[1] = in_low[1 * 4]; + in_high[2] = in_low[2 * 4]; + in_high[3] = in_low[3 * 4]; + ++in_low; } // Transform. - step[0] = input[0] + input[3]; - step[1] = input[1] + input[2]; - step[2] = input[1] - input[2]; - step[3] = input[0] - input[3]; + step[0] = in_high[0] + in_high[3]; + step[1] = in_high[1] + in_high[2]; + step[2] = in_high[1] - in_high[2]; + step[3] = in_high[0] - in_high[3]; temp1 = (step[0] + step[1]) * cospi_16_64; temp2 = (step[0] - step[1]) * cospi_16_64; out[0] = (tran_low_t)fdct_round_shift(temp1); @@ -60,12 +62,11 @@ void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { out[1] = (tran_low_t)fdct_round_shift(temp1); out[3] = (tran_low_t)fdct_round_shift(temp2); // Do next column (which is a transposed row in second/horizontal pass) - in_pass0++; - in++; + ++input; out += 4; } // Setup in/out for next pass. - in = intermediate; + in_low = intermediate; out = output; } @@ -83,7 +84,7 @@ void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { for (r = 0; r < 4; ++r) for (c = 0; c < 4; ++c) sum += input[r * stride + c]; - output[0] = sum << 1; + output[0] = sum * 2; } void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { @@ -99,7 +100,6 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { tran_high_t t0, t1, t2, t3; // needs32 tran_high_t x0, x1, x2, x3; // canbe16 - int i; for (i = 0; i < 8; i++) { // stage 1 if (pass == 0) { @@ -190,56 +190,57 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { int pass; // We need an intermediate buffer between passes. 
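  // Descriptive note on the two-pass loop below: pass 0 reads the int16_t
  // *input rows directly, while pass 1 reads back the tran_low_t intermediate
  // through in_low, which is why in_low starts out NULL and is asserted
  // non-NULL before it is dereferenced.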
tran_low_t intermediate[256]; - const int16_t *in_pass0 = input; - const tran_low_t *in = NULL; + const tran_low_t *in_low = NULL; tran_low_t *out = intermediate; // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { tran_high_t step1[8]; // canbe16 tran_high_t step2[8]; // canbe16 tran_high_t step3[8]; // canbe16 - tran_high_t input[8]; // canbe16 + tran_high_t in_high[8]; // canbe16 tran_high_t temp1, temp2; // needs32 int i; for (i = 0; i < 16; i++) { if (0 == pass) { // Calculate input for the first 8 results. - input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4; - input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4; - input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4; - input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4; - input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4; - input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4; - input[6] = (in_pass0[6 * stride] + in_pass0[9 * stride]) * 4; - input[7] = (in_pass0[7 * stride] + in_pass0[8 * stride]) * 4; + in_high[0] = (input[0 * stride] + input[15 * stride]) * 4; + in_high[1] = (input[1 * stride] + input[14 * stride]) * 4; + in_high[2] = (input[2 * stride] + input[13 * stride]) * 4; + in_high[3] = (input[3 * stride] + input[12 * stride]) * 4; + in_high[4] = (input[4 * stride] + input[11 * stride]) * 4; + in_high[5] = (input[5 * stride] + input[10 * stride]) * 4; + in_high[6] = (input[6 * stride] + input[9 * stride]) * 4; + in_high[7] = (input[7 * stride] + input[8 * stride]) * 4; // Calculate input for the next 8 results. - step1[0] = (in_pass0[7 * stride] - in_pass0[8 * stride]) * 4; - step1[1] = (in_pass0[6 * stride] - in_pass0[9 * stride]) * 4; - step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4; - step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4; - step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4; - step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4; - step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4; - step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4; + step1[0] = (input[7 * stride] - input[8 * stride]) * 4; + step1[1] = (input[6 * stride] - input[9 * stride]) * 4; + step1[2] = (input[5 * stride] - input[10 * stride]) * 4; + step1[3] = (input[4 * stride] - input[11 * stride]) * 4; + step1[4] = (input[3 * stride] - input[12 * stride]) * 4; + step1[5] = (input[2 * stride] - input[13 * stride]) * 4; + step1[6] = (input[1 * stride] - input[14 * stride]) * 4; + step1[7] = (input[0 * stride] - input[15 * stride]) * 4; } else { // Calculate input for the first 8 results. 
- input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); - input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); - input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); - input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); - input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); - input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); - input[6] = ((in[6 * 16] + 1) >> 2) + ((in[9 * 16] + 1) >> 2); - input[7] = ((in[7 * 16] + 1) >> 2) + ((in[8 * 16] + 1) >> 2); + assert(in_low != NULL); + in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2); + in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2); + in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2); + in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2); + in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2); + in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2); + in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2); + in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2); // Calculate input for the next 8 results. - step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[8 * 16] + 1) >> 2); - step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[9 * 16] + 1) >> 2); - step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); - step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); - step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); - step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); - step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); - step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); + step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2); + step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2); + step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2); + step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2); + step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2); + step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2); + step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2); + step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2); + in_low++; } // Work on the first eight values; fdct8(input, even_results); { @@ -248,14 +249,14 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { tran_high_t x0, x1, x2, x3; // canbe16 // stage 1 - s0 = input[0] + input[7]; - s1 = input[1] + input[6]; - s2 = input[2] + input[5]; - s3 = input[3] + input[4]; - s4 = input[3] - input[4]; - s5 = input[2] - input[5]; - s6 = input[1] - input[6]; - s7 = input[0] - input[7]; + s0 = in_high[0] + in_high[7]; + s1 = in_high[1] + in_high[6]; + s2 = in_high[2] + in_high[5]; + s3 = in_high[3] + in_high[4]; + s4 = in_high[3] - in_high[4]; + s5 = in_high[2] - in_high[5]; + s6 = in_high[1] - in_high[6]; + s7 = in_high[0] - in_high[7]; // fdct4(step, step); x0 = s0 + s3; @@ -350,12 +351,11 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { out[15] = (tran_low_t)fdct_round_shift(temp2); } // Do next column (which is a transposed row in second/horizontal pass) - in++; - in_pass0++; + input++; out += 16; } // Setup in/out for next pass. 
- in = intermediate; + in_low = intermediate; out = output; } } diff --git a/libs/libvpx/vpx_dsp/intrapred.c b/libs/libvpx/vpx_dsp/intrapred.c index eca17a983e..400e632e98 100644 --- a/libs/libvpx/vpx_dsp/intrapred.c +++ b/libs/libvpx/vpx_dsp/intrapred.c @@ -42,23 +42,6 @@ static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, dst[r * stride + c] = dst[(r + 1) * stride + c - 2]; } -#if CONFIG_MISC_FIXES -static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - (void)above; - - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], - left[(c >> 1) + r + 2]) - : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); - } - dst += stride; - } -} -#endif // CONFIG_MISC_FIXES - static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; @@ -76,22 +59,6 @@ static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs, } } -#if CONFIG_MISC_FIXES -static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - (void)left; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], - above[(r >> 1) + c + 2]) - : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); - } - dst += stride; - } -} -#endif // CONFIG_MISC_FIXES - static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { const uint8_t above_right = above[bs - 1]; @@ -111,21 +78,6 @@ static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs, } } -#if CONFIG_MISC_FIXES -static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - (void)left; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = AVG3(above[r + c], above[r + c + 1], - above[r + c + 1 + (r + c + 2 < bs * 2)]); - } - dst += stride; - } -} -#endif // CONFIG_MISC_FIXES - static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; @@ -367,7 +319,7 @@ void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, DST(3, 3) = AVG3(E, F, G); // differs from vp8 } -void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const int A = above[0]; const int B = above[1]; @@ -533,75 +485,46 @@ static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride, } } -#if CONFIG_MISC_FIXES -static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void)above; - (void)bd; - - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], - left[(c >> 1) + r + 2]) - : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); - } - dst += stride; - } -} -#endif // CONFIG_MISC_FIXES - static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; + int size; (void)left; (void)bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = r & 1 ? 
AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], - above[(r >> 1) + c + 2]) - : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); - } - dst += stride; + for (c = 0; c < bs; ++c) { + dst[c] = AVG2(above[c], above[c + 1]); + dst[stride + c] = AVG3(above[c], above[c + 1], above[c + 2]); + } + for (r = 2, size = bs - 2; r < bs; r += 2, --size) { + memcpy(dst + (r + 0) * stride, dst + (r >> 1), size * sizeof(*dst)); + vpx_memset16(dst + (r + 0) * stride + size, above[bs - 1], bs - size); + memcpy(dst + (r + 1) * stride, dst + stride + (r >> 1), + size * sizeof(*dst)); + vpx_memset16(dst + (r + 1) * stride + size, above[bs - 1], bs - size); } } -#define highbd_d63e_predictor highbd_d63_predictor - static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { - int r, c; + const uint16_t above_right = above[bs - 1]; + const uint16_t *const dst_row0 = dst; + int x, size; (void)left; (void)bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = r + c + 2 < bs * 2 - ? AVG3(above[r + c], above[r + c + 1], above[r + c + 2]) - : above[bs * 2 - 1]; - } - dst += stride; - } -} -#if CONFIG_MISC_FIXES -static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void)left; - (void)bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = AVG3(above[r + c], above[r + c + 1], - above[r + c + 1 + (r + c + 2 < bs * 2)]); - } + for (x = 0; x < bs - 1; ++x) { + dst[x] = AVG3(above[x], above[x + 1], above[x + 2]); + } + dst[bs - 1] = above_right; + dst += stride; + for (x = 1, size = bs - 2; x < bs; ++x, --size) { + memcpy(dst, dst_row0 + x, size * sizeof(*dst)); + vpx_memset16(dst + size, above_right, x + 1); dst += stride; } } -#endif // CONFIG_MISC_FIXES static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, @@ -633,19 +556,30 @@ static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { - int r, c; + int i; +#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7 + // silence a spurious -Warray-bounds warning, possibly related to: + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273 + uint16_t border[69]; +#else + uint16_t border[32 + 32 - 1]; // outer border from bottom-left to top-right +#endif (void)bd; - dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); - dst[stride] = AVG3(above[-1], left[0], left[1]); - for (r = 2; r < bs; ++r) - dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); + // dst(bs, bs - 2)[0], i.e., border starting at bottom-left + for (i = 0; i < bs - 2; ++i) { + border[i] = AVG3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]); + } + border[bs - 2] = AVG3(above[-1], left[0], left[1]); + border[bs - 1] = AVG3(left[0], above[-1], above[0]); + border[bs - 0] = AVG3(above[-1], above[0], above[1]); + // dst[0][2, size), i.e., remaining top border ascending + for (i = 0; i < bs - 2; ++i) { + border[bs + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]); + } - dst += stride; - for (r = 1; r < bs; ++r) { - for (c = 1; c < bs; c++) dst[c] = dst[-stride + c - 1]; - dst += stride; + for (i = 0; i < bs; ++i) { + memcpy(dst + i * stride, border + bs - 1 - i, bs * sizeof(dst[0])); } } @@ -776,6 +710,144 @@ 
static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs, dst += stride; } } + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + (void)above; + (void)bd; + DST(0, 0) = AVG2(I, J); + DST(2, 0) = DST(0, 1) = AVG2(J, K); + DST(2, 1) = DST(0, 2) = AVG2(K, L); + DST(1, 0) = AVG3(I, J, K); + DST(3, 0) = DST(1, 1) = AVG3(J, K, L); + DST(3, 1) = DST(1, 2) = AVG3(K, L, L); + DST(3, 2) = DST(2, 2) = DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; +} + +void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + (void)left; + (void)bd; + DST(0, 0) = AVG2(A, B); + DST(1, 0) = DST(0, 2) = AVG2(B, C); + DST(2, 0) = DST(1, 2) = AVG2(C, D); + DST(3, 0) = DST(2, 2) = AVG2(D, E); + DST(3, 2) = AVG2(E, F); // differs from vp8 + + DST(0, 1) = AVG3(A, B, C); + DST(1, 1) = DST(0, 3) = AVG3(B, C, D); + DST(2, 1) = DST(1, 3) = AVG3(C, D, E); + DST(3, 1) = DST(2, 3) = AVG3(D, E, F); + DST(3, 3) = AVG3(E, F, G); // differs from vp8 +} + +void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + const int H = above[7]; + (void)left; + (void)bd; + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); + DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); + DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); + DST(3, 2) = DST(2, 3) = AVG3(F, G, H); + DST(3, 3) = H; // differs from vp8 +} + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)bd; + DST(0, 0) = DST(1, 2) = AVG2(X, A); + DST(1, 0) = DST(2, 2) = AVG2(A, B); + DST(2, 0) = DST(3, 2) = AVG2(B, C); + DST(3, 0) = AVG2(C, D); + + DST(0, 3) = AVG3(K, J, I); + DST(0, 2) = AVG3(J, I, X); + DST(0, 1) = DST(1, 3) = AVG3(I, X, A); + DST(1, 1) = DST(2, 3) = AVG3(X, A, B); + DST(2, 1) = DST(3, 3) = AVG3(A, B, C); + DST(3, 1) = AVG3(B, C, D); +} + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)bd; + DST(0, 3) = AVG3(J, K, L); + DST(1, 3) = DST(0, 2) = AVG3(I, J, K); + DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J); + DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I); + DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X); + DST(3, 1) = DST(2, 0) = AVG3(C, B, A); + DST(3, 0) = AVG3(D, C, B); +} + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t 
*above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + (void)bd; + + DST(0, 0) = DST(2, 1) = AVG2(I, X); + DST(0, 1) = DST(2, 2) = AVG2(J, I); + DST(0, 2) = DST(2, 3) = AVG2(K, J); + DST(0, 3) = AVG2(L, K); + + DST(3, 0) = AVG3(A, B, C); + DST(2, 0) = AVG3(X, A, B); + DST(1, 0) = DST(3, 1) = AVG3(I, X, A); + DST(1, 1) = DST(3, 2) = AVG3(J, I, X); + DST(1, 2) = DST(3, 3) = AVG3(K, J, I); + DST(1, 3) = AVG3(L, K, J); +} #endif // CONFIG_VP9_HIGHBITDEPTH // This serves as a wrapper function, so that all the prediction functions @@ -811,7 +883,6 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs, intra_pred_sized(type, 8) \ intra_pred_sized(type, 16) \ intra_pred_sized(type, 32) \ - intra_pred_highbd_sized(type, 4) \ intra_pred_highbd_sized(type, 8) \ intra_pred_highbd_sized(type, 16) \ intra_pred_highbd_sized(type, 32) @@ -832,11 +903,6 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs, intra_pred_no_4x4(d207) intra_pred_no_4x4(d63) intra_pred_no_4x4(d45) -#if CONFIG_MISC_FIXES -intra_pred_allsizes(d207e) -intra_pred_allsizes(d63e) -intra_pred_no_4x4(d45e) -#endif intra_pred_no_4x4(d117) intra_pred_no_4x4(d135) intra_pred_no_4x4(d153) diff --git a/libs/libvpx/vpx_dsp/inv_txfm.c b/libs/libvpx/vpx_dsp/inv_txfm.c index 1526663d89..0194aa1e18 100644 --- a/libs/libvpx/vpx_dsp/inv_txfm.c +++ b/libs/libvpx/vpx_dsp/inv_txfm.c @@ -9,6 +9,7 @@ */ #include +#include #include #include "./vpx_dsp_rtcd.h" @@ -66,7 +67,7 @@ void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } } -void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { +void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; @@ -83,179 +84,28 @@ void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { for (i = 0; i < 4; i++) { e1 = ip[0] >> 1; a1 = ip[0] - e1; - dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1); - dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1); - dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1); - dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1); + dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1); + dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1); + dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1); + dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1); ip++; dest++; } } -void idct4_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step[4]; - tran_high_t temp1, temp2; - // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = WRAPLOW(dct_const_round_shift(temp1)); - step[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = WRAPLOW(dct_const_round_shift(temp1)); - step[3] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 2 - output[0] = WRAPLOW(step[0] + step[3]); - output[1] = WRAPLOW(step[1] + step[2]); - output[2] = WRAPLOW(step[1] - step[2]); - output[3] = WRAPLOW(step[0] - step[3]); -} - -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - tran_low_t 
temp_in[4], temp_out[4]; - - // Rows - for (i = 0; i < 4; ++i) { - idct4_c(input, outptr); - input += 4; - outptr += 4; - } - - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - idct4_c(temp_in, temp_out); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 4)); - } - } -} - -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, - int dest_stride) { - int i; - tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); - a1 = ROUND_POWER_OF_TWO(out, 4); - - for (i = 0; i < 4; i++) { - dest[0] = clip_pixel_add(dest[0], a1); - dest[1] = clip_pixel_add(dest[1], a1); - dest[2] = clip_pixel_add(dest[2], a1); - dest[3] = clip_pixel_add(dest[3], a1); - dest += dest_stride; - } -} - -void idct8_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[8], step2[8]; - tran_high_t temp1, temp2; - // stage 1 - step1[0] = input[0]; - step1[2] = input[4]; - step1[1] = input[2]; - step1[3] = input[6]; - temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1)); - step1[7] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 2 - temp1 = (step1[0] + step1[2]) * cospi_16_64; - temp2 = (step1[0] - step1[2]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1)); - step2[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1)); - step2[3] = WRAPLOW(dct_const_round_shift(temp2)); - step2[4] = WRAPLOW(step1[4] + step1[5]); - step2[5] = WRAPLOW(step1[4] - step1[5]); - step2[6] = WRAPLOW(-step1[6] + step1[7]); - step2[7] = WRAPLOW(step1[6] + step1[7]); - - // stage 3 - step1[0] = WRAPLOW(step2[0] + step2[3]); - step1[1] = WRAPLOW(step2[1] + step2[2]); - step1[2] = WRAPLOW(step2[1] - step2[2]); - step1[3] = WRAPLOW(step2[0] - step2[3]); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - step1[7] = step2[7]; - - // stage 4 - output[0] = WRAPLOW(step1[0] + step1[7]); - output[1] = WRAPLOW(step1[1] + step1[6]); - output[2] = WRAPLOW(step1[2] + step1[5]); - output[3] = WRAPLOW(step1[3] + step1[4]); - output[4] = WRAPLOW(step1[3] - step1[4]); - output[5] = WRAPLOW(step1[2] - step1[5]); - output[6] = WRAPLOW(step1[1] - step1[6]); - output[7] = WRAPLOW(step1[0] - step1[7]); -} - -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - - // First transform rows - for (i = 0; i < 8; ++i) { - idct8_c(input, outptr); - input += 8; - outptr += 8; - } - - // Then transform columns - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - idct8_c(temp_in, temp_out); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 5)); - 
} - } -} - -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - int i, j; - tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); - a1 = ROUND_POWER_OF_TWO(out, 5); - for (j = 0; j < 8; ++j) { - for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1); - dest += stride; - } -} - void iadst4_c(const tran_low_t *input, tran_low_t *output) { tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - tran_low_t x0 = input[0]; tran_low_t x1 = input[1]; tran_low_t x2 = input[2]; tran_low_t x3 = input[3]; if (!(x0 | x1 | x2 | x3)) { - output[0] = output[1] = output[2] = output[3] = 0; + memset(output, 0, 4 * sizeof(*output)); return; } + // 32-bit result is enough for the following multiplications. s0 = sinpi_1_9 * x0; s1 = sinpi_2_9 * x0; s2 = sinpi_3_9 * x1; @@ -280,9 +130,71 @@ void iadst4_c(const tran_low_t *input, tran_low_t *output) { output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); } +void idct4_c(const tran_low_t *input, tran_low_t *output) { + int16_t step[4]; + tran_high_t temp1, temp2; + + // stage 1 + temp1 = ((int16_t)input[0] + (int16_t)input[2]) * cospi_16_64; + temp2 = ((int16_t)input[0] - (int16_t)input[2]) * cospi_16_64; + step[0] = WRAPLOW(dct_const_round_shift(temp1)); + step[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (int16_t)input[1] * cospi_24_64 - (int16_t)input[3] * cospi_8_64; + temp2 = (int16_t)input[1] * cospi_8_64 + (int16_t)input[3] * cospi_24_64; + step[2] = WRAPLOW(dct_const_round_shift(temp1)); + step[3] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + output[0] = WRAPLOW(step[0] + step[3]); + output[1] = WRAPLOW(step[1] + step[2]); + output[2] = WRAPLOW(step[1] - step[2]); + output[3] = WRAPLOW(step[0] - step[3]); +} + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + tran_low_t temp_in[4], temp_out[4]; + + // Rows + for (i = 0; i < 4; ++i) { + idct4_c(input, outptr); + input += 4; + outptr += 4; + } + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; + idct4_c(temp_in, temp_out); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 4)); + } + } +} + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 4); + + for (i = 0; i < 4; i++) { + dest[0] = clip_pixel_add(dest[0], a1); + dest[1] = clip_pixel_add(dest[1], a1); + dest[2] = clip_pixel_add(dest[2], a1); + dest[3] = clip_pixel_add(dest[3], a1); + dest += stride; + } +} + void iadst8_c(const tran_low_t *input, tran_low_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; - tran_high_t x0 = input[7]; tran_high_t x1 = input[0]; tran_high_t x2 = input[5]; @@ -293,8 +205,7 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) { tran_high_t x7 = input[6]; if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = - output[6] = output[7] = 0; + memset(output, 0, 8 * sizeof(*output)); return; } @@ -357,14 +268,93 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) { output[7] = WRAPLOW(-x1); } -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, 
int stride) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; +void idct8_c(const tran_low_t *input, tran_low_t *output) { + int16_t step1[8], step2[8]; + tran_high_t temp1, temp2; + + // stage 1 + step1[0] = (int16_t)input[0]; + step1[2] = (int16_t)input[4]; + step1[1] = (int16_t)input[2]; + step1[3] = (int16_t)input[6]; + temp1 = (int16_t)input[1] * cospi_28_64 - (int16_t)input[7] * cospi_4_64; + temp2 = (int16_t)input[1] * cospi_4_64 + (int16_t)input[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (int16_t)input[5] * cospi_12_64 - (int16_t)input[3] * cospi_20_64; + temp2 = (int16_t)input[5] * cospi_20_64 + (int16_t)input[3] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + temp1 = (step1[0] + step1[2]) * cospi_16_64; + temp2 = (step1[0] - step1[2]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + // stage 3 + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + // stage 4 + output[0] = WRAPLOW(step1[0] + step1[7]); + output[1] = WRAPLOW(step1[1] + step1[6]); + output[2] = WRAPLOW(step1[2] + step1[5]); + output[3] = WRAPLOW(step1[3] + step1[4]); + output[4] = WRAPLOW(step1[3] - step1[4]); + output[5] = WRAPLOW(step1[2] - step1[5]); + output[6] = WRAPLOW(step1[1] - step1[6]); + output[7] = WRAPLOW(step1[0] - step1[7]); +} + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; tran_low_t temp_in[8], temp_out[8]; // First transform rows - // only first 4 row has non-zero coefs + for (i = 0; i < 8; ++i) { + idct8_c(input, outptr); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + idct8_c(temp_in, temp_out); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 5)); + } + } +} + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[8], temp_out[8]; + + // First transform rows + // Only first 4 row has non-zero coefs for (i = 0; i < 4; ++i) { idct8_c(input, outptr); input += 8; @@ -382,27 +372,209 @@ void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } } +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + + out = WRAPLOW(dct_const_round_shift(out * 
cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 5); + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1); + dest += stride; + } +} + +void iadst16_c(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + tran_high_t x0 = input[15]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[13]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[11]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[9]; + tran_high_t x7 = input[6]; + tran_high_t x8 = input[7]; + tran_high_t x9 = input[8]; + tran_high_t x10 = input[5]; + tran_high_t x11 = input[10]; + tran_high_t x12 = input[3]; + tran_high_t x13 = input[12]; + tran_high_t x14 = input[1]; + tran_high_t x15 = input[14]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + memset(output, 0, 16 * sizeof(*output)); + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); + x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); + x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); + x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); + x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); + x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); + x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); + x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); + x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); + x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); + x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); + x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = WRAPLOW(s0 + s4); + x1 = WRAPLOW(s1 + s5); + x2 = WRAPLOW(s2 + s6); + x3 = WRAPLOW(s3 + s7); + x4 = WRAPLOW(s0 - s4); + x5 = WRAPLOW(s1 - s5); + x6 = WRAPLOW(s2 - s6); + x7 = WRAPLOW(s3 - s7); + x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); + x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); + x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); + x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); + x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); 
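+  // Descriptive note: the cospi_*_64 constants are Q14 fixed-point values, so
+  // dct_const_round_shift() rounds each product back down by 14 bits, and
+  // WRAPLOW() wraps the result at 16 bits so this C reference behaves like
+  // the 16-bit SIMD lanes do on overflow.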
+ + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = WRAPLOW(s0 + s2); + x1 = WRAPLOW(s1 + s3); + x2 = WRAPLOW(s0 - s2); + x3 = WRAPLOW(s1 - s3); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); + x8 = WRAPLOW(s8 + s10); + x9 = WRAPLOW(s9 + s11); + x10 = WRAPLOW(s8 - s10); + x11 = WRAPLOW(s9 - s11); + x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); + x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); + x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); + + // stage 4 + s2 = (-cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (-cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = WRAPLOW(dct_const_round_shift(s2)); + x3 = WRAPLOW(dct_const_round_shift(s3)); + x6 = WRAPLOW(dct_const_round_shift(s6)); + x7 = WRAPLOW(dct_const_round_shift(s7)); + x10 = WRAPLOW(dct_const_round_shift(s10)); + x11 = WRAPLOW(dct_const_round_shift(s11)); + x14 = WRAPLOW(dct_const_round_shift(s14)); + x15 = WRAPLOW(dct_const_round_shift(s15)); + + output[0] = WRAPLOW(x0); + output[1] = WRAPLOW(-x8); + output[2] = WRAPLOW(x12); + output[3] = WRAPLOW(-x4); + output[4] = WRAPLOW(x6); + output[5] = WRAPLOW(x14); + output[6] = WRAPLOW(x10); + output[7] = WRAPLOW(x2); + output[8] = WRAPLOW(x3); + output[9] = WRAPLOW(x11); + output[10] = WRAPLOW(x15); + output[11] = WRAPLOW(x7); + output[12] = WRAPLOW(x5); + output[13] = WRAPLOW(-x13); + output[14] = WRAPLOW(x9); + output[15] = WRAPLOW(-x1); +} + void idct16_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[16], step2[16]; + int16_t step1[16], step2[16]; tran_high_t temp1, temp2; // stage 1 - step1[0] = input[0 / 2]; - step1[1] = input[16 / 2]; - step1[2] = input[8 / 2]; - step1[3] = input[24 / 2]; - step1[4] = input[4 / 2]; - step1[5] = input[20 / 2]; - step1[6] = input[12 / 2]; - step1[7] = input[28 / 2]; - step1[8] = input[2 / 2]; - step1[9] = input[18 / 2]; - step1[10] = input[10 / 2]; - step1[11] = input[26 / 2]; - step1[12] = input[6 / 2]; - step1[13] = input[22 / 2]; - step1[14] = input[14 / 2]; - step1[15] = input[30 / 2]; + step1[0] = (int16_t)input[0 / 2]; + step1[1] = (int16_t)input[16 / 2]; + step1[2] = (int16_t)input[8 / 2]; + step1[3] = (int16_t)input[24 / 2]; + step1[4] = (int16_t)input[4 / 2]; + step1[5] = (int16_t)input[20 / 2]; + step1[6] = (int16_t)input[12 / 2]; + step1[7] = (int16_t)input[28 / 2]; + step1[8] = (int16_t)input[2 / 2]; + step1[9] = (int16_t)input[18 / 2]; + step1[10] = (int16_t)input[10 / 2]; + step1[11] = (int16_t)input[26 / 2]; + step1[12] = (int16_t)input[6 / 2]; + step1[13] = (int16_t)input[22 / 2]; + step1[14] = (int16_t)input[14 / 2]; + step1[15] = (int16_t)input[30 / 2]; // stage 2 step2[0] = step1[0]; @@ -549,9 +721,9 @@ void idct16_c(const tran_low_t *input, tran_low_t *output) { void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t 
*dest, int stride) { + int i, j; tran_low_t out[16 * 16]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[16], temp_out[16]; // First transform rows @@ -572,182 +744,37 @@ void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, } } -void iadst16_c(const tran_low_t *input, tran_low_t *output) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; - tran_high_t s9, s10, s11, s12, s13, s14, s15; +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + int i, j; + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; - tran_high_t x0 = input[15]; - tran_high_t x1 = input[0]; - tran_high_t x2 = input[13]; - tran_high_t x3 = input[2]; - tran_high_t x4 = input[11]; - tran_high_t x5 = input[4]; - tran_high_t x6 = input[9]; - tran_high_t x7 = input[6]; - tran_high_t x8 = input[7]; - tran_high_t x9 = input[8]; - tran_high_t x10 = input[5]; - tran_high_t x11 = input[10]; - tran_high_t x12 = input[3]; - tran_high_t x13 = input[12]; - tran_high_t x14 = input[1]; - tran_high_t x15 = input[14]; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | - x13 | x14 | x15)) { - output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = - output[6] = output[7] = output[8] = output[9] = output[10] = - output[11] = output[12] = output[13] = output[14] = output[15] = 0; - return; + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 8x8 area, we only need to calculate first 8 rows here. + for (i = 0; i < 8; ++i) { + idct16_c(input, outptr); + input += 16; + outptr += 16; } - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); - x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); - x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); - x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); - x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); - x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); - x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); - x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); - x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); - x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); - x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); - x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); - x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); - x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); - x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = -x14 * cospi_12_64 + 
x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = WRAPLOW(s0 + s4); - x1 = WRAPLOW(s1 + s5); - x2 = WRAPLOW(s2 + s6); - x3 = WRAPLOW(s3 + s7); - x4 = WRAPLOW(s0 - s4); - x5 = WRAPLOW(s1 - s5); - x6 = WRAPLOW(s2 - s6); - x7 = WRAPLOW(s3 - s7); - x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); - x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); - x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); - x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); - x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); - x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); - x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = WRAPLOW(s0 + s2); - x1 = WRAPLOW(s1 + s3); - x2 = WRAPLOW(s0 - s2); - x3 = WRAPLOW(s1 - s3); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); - x8 = WRAPLOW(s8 + s10); - x9 = WRAPLOW(s9 + s11); - x10 = WRAPLOW(s8 - s10); - x11 = WRAPLOW(s9 - s11); - x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); - x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); - x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); - - // stage 4 - s2 = (-cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (-x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (-x10 + x11); - s14 = (-cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = WRAPLOW(dct_const_round_shift(s2)); - x3 = WRAPLOW(dct_const_round_shift(s3)); - x6 = WRAPLOW(dct_const_round_shift(s6)); - x7 = WRAPLOW(dct_const_round_shift(s7)); - x10 = WRAPLOW(dct_const_round_shift(s10)); - x11 = WRAPLOW(dct_const_round_shift(s11)); - x14 = WRAPLOW(dct_const_round_shift(s14)); - x15 = WRAPLOW(dct_const_round_shift(s15)); - - output[0] = WRAPLOW(x0); - output[1] = WRAPLOW(-x8); - output[2] = WRAPLOW(x12); - output[3] = WRAPLOW(-x4); - output[4] = WRAPLOW(x6); - output[5] = WRAPLOW(x14); - output[6] = WRAPLOW(x10); - output[7] = WRAPLOW(x2); - output[8] = WRAPLOW(x3); - output[9] = WRAPLOW(x11); - output[10] = WRAPLOW(x15); - output[11] = WRAPLOW(x7); - output[12] = WRAPLOW(x5); - output[13] = WRAPLOW(-x13); - output[14] = WRAPLOW(x9); - output[15] = WRAPLOW(-x1); + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + idct16_c(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } } void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[16 * 16] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[16], temp_out[16]; // First transform rows. 
Since all non-zero dct coefficients are in @@ -772,7 +799,9 @@ void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { @@ -782,64 +811,64 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } void idct32_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[32], step2[32]; + int16_t step1[32], step2[32]; tran_high_t temp1, temp2; // stage 1 - step1[0] = input[0]; - step1[1] = input[16]; - step1[2] = input[8]; - step1[3] = input[24]; - step1[4] = input[4]; - step1[5] = input[20]; - step1[6] = input[12]; - step1[7] = input[28]; - step1[8] = input[2]; - step1[9] = input[18]; - step1[10] = input[10]; - step1[11] = input[26]; - step1[12] = input[6]; - step1[13] = input[22]; - step1[14] = input[14]; - step1[15] = input[30]; + step1[0] = (int16_t)input[0]; + step1[1] = (int16_t)input[16]; + step1[2] = (int16_t)input[8]; + step1[3] = (int16_t)input[24]; + step1[4] = (int16_t)input[4]; + step1[5] = (int16_t)input[20]; + step1[6] = (int16_t)input[12]; + step1[7] = (int16_t)input[28]; + step1[8] = (int16_t)input[2]; + step1[9] = (int16_t)input[18]; + step1[10] = (int16_t)input[10]; + step1[11] = (int16_t)input[26]; + step1[12] = (int16_t)input[6]; + step1[13] = (int16_t)input[22]; + step1[14] = (int16_t)input[14]; + step1[15] = (int16_t)input[30]; - temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; - temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; + temp1 = (int16_t)input[1] * cospi_31_64 - (int16_t)input[31] * cospi_1_64; + temp2 = (int16_t)input[1] * cospi_1_64 + (int16_t)input[31] * cospi_31_64; step1[16] = WRAPLOW(dct_const_round_shift(temp1)); step1[31] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; - temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; + temp1 = (int16_t)input[17] * cospi_15_64 - (int16_t)input[15] * cospi_17_64; + temp2 = (int16_t)input[17] * cospi_17_64 + (int16_t)input[15] * cospi_15_64; step1[17] = WRAPLOW(dct_const_round_shift(temp1)); step1[30] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; - temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; + temp1 = (int16_t)input[9] * cospi_23_64 - (int16_t)input[23] * cospi_9_64; + temp2 = (int16_t)input[9] * cospi_9_64 + (int16_t)input[23] * cospi_23_64; step1[18] = WRAPLOW(dct_const_round_shift(temp1)); step1[29] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; - temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; + temp1 = (int16_t)input[25] * cospi_7_64 - (int16_t)input[7] * cospi_25_64; + temp2 = (int16_t)input[25] * cospi_25_64 + (int16_t)input[7] * cospi_7_64; step1[19] = WRAPLOW(dct_const_round_shift(temp1)); step1[28] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; - temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; + temp1 = (int16_t)input[5] * cospi_27_64 - (int16_t)input[27] * cospi_5_64; + temp2 = (int16_t)input[5] * cospi_5_64 + (int16_t)input[27] * cospi_27_64; step1[20] = WRAPLOW(dct_const_round_shift(temp1)); step1[27] = 
WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; - temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; + temp1 = (int16_t)input[21] * cospi_11_64 - (int16_t)input[11] * cospi_21_64; + temp2 = (int16_t)input[21] * cospi_21_64 + (int16_t)input[11] * cospi_11_64; step1[21] = WRAPLOW(dct_const_round_shift(temp1)); step1[26] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; - temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; + temp1 = (int16_t)input[13] * cospi_19_64 - (int16_t)input[19] * cospi_13_64; + temp2 = (int16_t)input[13] * cospi_13_64 + (int16_t)input[19] * cospi_19_64; step1[22] = WRAPLOW(dct_const_round_shift(temp1)); step1[25] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; - temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; + temp1 = (int16_t)input[29] * cospi_3_64 - (int16_t)input[3] * cospi_29_64; + temp2 = (int16_t)input[29] * cospi_29_64 + (int16_t)input[3] * cospi_3_64; step1[23] = WRAPLOW(dct_const_round_shift(temp1)); step1[24] = WRAPLOW(dct_const_round_shift(temp2)); @@ -1150,23 +1179,17 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) { void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[32 * 32]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; // Rows for (i = 0; i < 32; ++i) { - int16_t zero_coeff[16]; - for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; - for (j = 0; j < 8; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 4; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 2; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + int16_t zero_coeff = 0; + for (j = 0; j < 32; ++j) zero_coeff |= input[j]; - if (zero_coeff[0] | zero_coeff[1]) + if (zero_coeff) idct32_c(input, outptr); else memset(outptr, 0, sizeof(tran_low_t) * 32); @@ -1187,13 +1210,13 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; // Rows - // only upper-left 16x16 has non-zero coeff + // Only upper-left 16x16 has non-zero coeff for (i = 0; i < 16; ++i) { idct32_c(input, outptr); input += 32; @@ -1213,13 +1236,13 @@ void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; // Rows - // only upper-left 8x8 has non-zero coeff + // Only upper-left 8x8 has non-zero coeff for (i = 0; i < 8; ++i) { idct32_c(input, outptr); input += 32; @@ -1240,8 +1263,9 @@ void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); @@ -1252,7 +1276,20 @@ void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } #if 
CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + +// 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse +// transform amplify bits + 1 bit for contingency in rounding and quantizing +#define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25) + +static INLINE int detect_invalid_highbd_input(const tran_low_t *input, + int size) { + int i; + for (i = 0; i < size; ++i) + if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1; + return 0; +} + +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. */ @@ -1261,7 +1298,6 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, tran_high_t a1, b1, c1, d1, e1; const tran_low_t *ip = input; tran_low_t *op = output; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); for (i = 0; i < 4; i++) { a1 = ip[0] >> UNIT_QUANT_SHIFT; @@ -1310,14 +1346,13 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, - int dest_stride, int bd) { +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest, + int stride, int bd) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; const tran_low_t *ip = in; tran_low_t *op = tmp; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); (void)bd; a1 = ip[0] >> UNIT_QUANT_SHIFT; @@ -1330,32 +1365,84 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, for (i = 0; i < 4; i++) { e1 = ip[0] >> 1; a1 = ip[0] - e1; - dest[dest_stride * 0] = - highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd); - dest[dest_stride * 1] = - highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd); - dest[dest_stride * 2] = - highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd); - dest[dest_stride * 3] = - highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd); + dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd); + dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd); + dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd); + dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd); ip++; dest++; } } +void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + tran_low_t x0 = input[0]; + tran_low_t x1 = input[1]; + tran_low_t x2 = input[2]; + tran_low_t x3 = input[3]; + (void)bd; + + if (detect_invalid_highbd_input(input, 4)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 4); + return; + } + + if (!(x0 | x1 | x2 | x3)) { + memset(output, 0, 4 * sizeof(*output)); + return; + } + + s0 = (tran_high_t)sinpi_1_9 * x0; + s1 = (tran_high_t)sinpi_2_9 * x0; + s2 = (tran_high_t)sinpi_3_9 * x1; + s3 = (tran_high_t)sinpi_4_9 * x2; + s4 = (tran_high_t)sinpi_1_9 * x2; + s5 = (tran_high_t)sinpi_2_9 * x3; + s6 = (tran_high_t)sinpi_4_9 * x3; + s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd); + + s0 = s0 + s3 + s5; + s1 = s1 - s4 - s6; + s3 = s2; + s2 = sinpi_3_9 * s7; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. 
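+  // (dct_const_round_shift() below removes those 14 multiplication-scaling
+  // bits again: 29b - 14b = 15b, and HIGHBD_WRAPLOW() then wraps the result
+  // for the given bit depth.)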
+ output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd); + output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd); + output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd); +} + void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t step[4]; tran_high_t temp1, temp2; (void)bd; + + if (detect_invalid_highbd_input(input, 4)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 4); + return; + } + // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (input[0] + input[2]) * (tran_high_t)cospi_16_64; + temp2 = (input[0] - input[2]) * (tran_high_t)cospi_16_64; + step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + input[1] * (tran_high_t)cospi_24_64 - input[3] * (tran_high_t)cospi_8_64; + temp2 = + input[1] * (tran_high_t)cospi_8_64 + input[3] * (tran_high_t)cospi_24_64; + step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); // stage 2 output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd); @@ -1364,13 +1451,12 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd); } -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { + int i, j; tran_low_t out[4 * 4]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[4], temp_out[4]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows for (i = 0; i < 4; ++i) { @@ -1390,15 +1476,15 @@ void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, - int dest_stride, int bd) { +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { int i; tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 4); for (i = 0; i < 4; i++) { @@ -1406,26 +1492,123 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); - dest += dest_stride; + dest += stride; } } +void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + tran_low_t x0 = input[7]; + tran_low_t x1 = input[0]; + tran_low_t x2 = input[5]; + 
tran_low_t x3 = input[2]; + tran_low_t x4 = input[3]; + tran_low_t x5 = input[4]; + tran_low_t x6 = input[1]; + tran_low_t x7 = input[6]; + (void)bd; + + if (detect_invalid_highbd_input(input, 8)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 8); + return; + } + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + memset(output, 0, 8 * sizeof(*output)); + return; + } + + // stage 1 + s0 = (tran_high_t)cospi_2_64 * x0 + (tran_high_t)cospi_30_64 * x1; + s1 = (tran_high_t)cospi_30_64 * x0 - (tran_high_t)cospi_2_64 * x1; + s2 = (tran_high_t)cospi_10_64 * x2 + (tran_high_t)cospi_22_64 * x3; + s3 = (tran_high_t)cospi_22_64 * x2 - (tran_high_t)cospi_10_64 * x3; + s4 = (tran_high_t)cospi_18_64 * x4 + (tran_high_t)cospi_14_64 * x5; + s5 = (tran_high_t)cospi_14_64 * x4 - (tran_high_t)cospi_18_64 * x5; + s6 = (tran_high_t)cospi_26_64 * x6 + (tran_high_t)cospi_6_64 * x7; + s7 = (tran_high_t)cospi_6_64 * x6 - (tran_high_t)cospi_26_64 * x7; + + x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd); + x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = (tran_high_t)cospi_8_64 * x4 + (tran_high_t)cospi_24_64 * x5; + s5 = (tran_high_t)cospi_24_64 * x4 - (tran_high_t)cospi_8_64 * x5; + s6 = (tran_high_t)(-cospi_24_64) * x6 + (tran_high_t)cospi_8_64 * x7; + s7 = (tran_high_t)cospi_8_64 * x6 + (tran_high_t)cospi_24_64 * x7; + + x0 = HIGHBD_WRAPLOW(s0 + s2, bd); + x1 = HIGHBD_WRAPLOW(s1 + s3, bd); + x2 = HIGHBD_WRAPLOW(s0 - s2, bd); + x3 = HIGHBD_WRAPLOW(s1 - s3, bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); + + // stage 3 + s2 = (tran_high_t)cospi_16_64 * (x2 + x3); + s3 = (tran_high_t)cospi_16_64 * (x2 - x3); + s6 = (tran_high_t)cospi_16_64 * (x6 + x7); + s7 = (tran_high_t)cospi_16_64 * (x6 - x7); + + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); + + output[0] = HIGHBD_WRAPLOW(x0, bd); + output[1] = HIGHBD_WRAPLOW(-x4, bd); + output[2] = HIGHBD_WRAPLOW(x6, bd); + output[3] = HIGHBD_WRAPLOW(-x2, bd); + output[4] = HIGHBD_WRAPLOW(x3, bd); + output[5] = HIGHBD_WRAPLOW(-x7, bd); + output[6] = HIGHBD_WRAPLOW(x5, bd); + output[7] = HIGHBD_WRAPLOW(-x1, bd); +} + void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t step1[8], step2[8]; tran_high_t temp1, temp2; + + if (detect_invalid_highbd_input(input, 8)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 8); + return; + } + // stage 1 step1[0] = input[0]; step1[2] = input[4]; step1[1] = input[2]; step1[3] = input[6]; - temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - temp2 = input[1] * cospi_4_64 + input[7] * 
cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + input[1] * (tran_high_t)cospi_28_64 - input[7] * (tran_high_t)cospi_4_64; + temp2 = + input[1] * (tran_high_t)cospi_4_64 + input[7] * (tran_high_t)cospi_28_64; + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + input[5] * (tran_high_t)cospi_12_64 - input[3] * (tran_high_t)cospi_20_64; + temp2 = + input[5] * (tran_high_t)cospi_20_64 + input[3] * (tran_high_t)cospi_12_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); // stage 2 & stage 3 - even half vpx_highbd_idct4_c(step1, step1, bd); @@ -1438,10 +1621,10 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { // stage 3 - odd half step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64; + temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; // stage 4 @@ -1455,22 +1638,21 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); } -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { + int i, j; tran_low_t out[8 * 8]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - // First transform rows. + // First transform rows for (i = 0; i < 8; ++i) { vpx_highbd_idct8_c(input, outptr, bd); input += 8; outptr += 8; } - // Then transform columns. 
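// Summary of the add/scale step below: the 2-D inverse transform is applied
// separably (rows first, then columns), and each reconstructed residual is
// scaled with ROUND_POWER_OF_TWO(temp_out[j], 5), i.e. (temp_out[j] + 16) >> 5
// for the 8x8 transform (the 4x4 path uses a shift of 4, the 16x16 and 32x32
// paths a shift of 6), before highbd_clip_pixel_add() clamps the sum with the
// prediction in dest[] to the valid pixel range for bd.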
+ // Then transform columns for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; vpx_highbd_idct8_c(temp_in, temp_out, bd); @@ -1481,14 +1663,41 @@ void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[8], temp_out[8]; + + // First transform rows + // Only first 4 row has non-zero coefs + for (i = 0; i < 4; ++i) { + vpx_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + vpx_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } +} + +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); @@ -1496,145 +1705,181 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - - tran_low_t x0 = input[0]; - tran_low_t x1 = input[1]; - tran_low_t x2 = input[2]; - tran_low_t x3 = input[3]; +void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + tran_low_t x0 = input[15]; + tran_low_t x1 = input[0]; + tran_low_t x2 = input[13]; + tran_low_t x3 = input[2]; + tran_low_t x4 = input[11]; + tran_low_t x5 = input[4]; + tran_low_t x6 = input[9]; + tran_low_t x7 = input[6]; + tran_low_t x8 = input[7]; + tran_low_t x9 = input[8]; + tran_low_t x10 = input[5]; + tran_low_t x11 = input[10]; + tran_low_t x12 = input[3]; + tran_low_t x13 = input[12]; + tran_low_t x14 = input[1]; + tran_low_t x15 = input[14]; (void)bd; - if (!(x0 | x1 | x2 | x3)) { - memset(output, 0, 4 * sizeof(*output)); + if (detect_invalid_highbd_input(input, 16)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 16); return; } - s0 = sinpi_1_9 * x0; - s1 = sinpi_2_9 * x0; - s2 = sinpi_3_9 * x1; - s3 = sinpi_4_9 * x2; - s4 = sinpi_1_9 * x2; - s5 = sinpi_2_9 * x3; - s6 = sinpi_4_9 * x3; - s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd); - - s0 = s0 + s3 + s5; - s1 = s1 - s4 - s6; - s3 = s2; - s2 = sinpi_3_9 * s7; - - // 1-D transform scaling factor is sqrt(2). - // The overall dynamic range is 14b (input) + 14b (multiplication scaling) - // + 1b (addition) = 29b. - // Hence the output bit depth is 15b. 
- output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd); - output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd); - output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); - output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd); -} - -void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - - tran_low_t x0 = input[7]; - tran_low_t x1 = input[0]; - tran_low_t x2 = input[5]; - tran_low_t x3 = input[2]; - tran_low_t x4 = input[3]; - tran_low_t x5 = input[4]; - tran_low_t x6 = input[1]; - tran_low_t x7 = input[6]; - (void)bd; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - memset(output, 0, 8 * sizeof(*output)); + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + memset(output, 0, 16 * sizeof(*output)); return; } // stage 1 - s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s1 = cospi_30_64 * x0 - cospi_2_64 * x1; - s2 = cospi_10_64 * x2 + cospi_22_64 * x3; - s3 = cospi_22_64 * x2 - cospi_10_64 * x3; - s4 = cospi_18_64 * x4 + cospi_14_64 * x5; - s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + s0 = x0 * (tran_high_t)cospi_1_64 + x1 * (tran_high_t)cospi_31_64; + s1 = x0 * (tran_high_t)cospi_31_64 - x1 * (tran_high_t)cospi_1_64; + s2 = x2 * (tran_high_t)cospi_5_64 + x3 * (tran_high_t)cospi_27_64; + s3 = x2 * (tran_high_t)cospi_27_64 - x3 * (tran_high_t)cospi_5_64; + s4 = x4 * (tran_high_t)cospi_9_64 + x5 * (tran_high_t)cospi_23_64; + s5 = x4 * (tran_high_t)cospi_23_64 - x5 * (tran_high_t)cospi_9_64; + s6 = x6 * (tran_high_t)cospi_13_64 + x7 * (tran_high_t)cospi_19_64; + s7 = x6 * (tran_high_t)cospi_19_64 - x7 * (tran_high_t)cospi_13_64; + s8 = x8 * (tran_high_t)cospi_17_64 + x9 * (tran_high_t)cospi_15_64; + s9 = x8 * (tran_high_t)cospi_15_64 - x9 * (tran_high_t)cospi_17_64; + s10 = x10 * (tran_high_t)cospi_21_64 + x11 * (tran_high_t)cospi_11_64; + s11 = x10 * (tran_high_t)cospi_11_64 - x11 * (tran_high_t)cospi_21_64; + s12 = x12 * (tran_high_t)cospi_25_64 + x13 * (tran_high_t)cospi_7_64; + s13 = x12 * (tran_high_t)cospi_7_64 - x13 * (tran_high_t)cospi_25_64; + s14 = x14 * (tran_high_t)cospi_29_64 + x15 * (tran_high_t)cospi_3_64; + s15 = x14 * (tran_high_t)cospi_3_64 - x15 * (tran_high_t)cospi_29_64; - x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd); - x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd); - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd); + x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd); + x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd); + x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd); + x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd); + x10 = 
HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd); // stage 2 s0 = x0; s1 = x1; s2 = x2; s3 = x3; - s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * (tran_high_t)cospi_4_64 + x9 * (tran_high_t)cospi_28_64; + s9 = x8 * (tran_high_t)cospi_28_64 - x9 * (tran_high_t)cospi_4_64; + s10 = x10 * (tran_high_t)cospi_20_64 + x11 * (tran_high_t)cospi_12_64; + s11 = x10 * (tran_high_t)cospi_12_64 - x11 * (tran_high_t)cospi_20_64; + s12 = -x12 * (tran_high_t)cospi_28_64 + x13 * (tran_high_t)cospi_4_64; + s13 = x12 * (tran_high_t)cospi_4_64 + x13 * (tran_high_t)cospi_28_64; + s14 = -x14 * (tran_high_t)cospi_12_64 + x15 * (tran_high_t)cospi_20_64; + s15 = x14 * (tran_high_t)cospi_20_64 + x15 * (tran_high_t)cospi_12_64; + + x0 = HIGHBD_WRAPLOW(s0 + s4, bd); + x1 = HIGHBD_WRAPLOW(s1 + s5, bd); + x2 = HIGHBD_WRAPLOW(s2 + s6, bd); + x3 = HIGHBD_WRAPLOW(s3 + s7, bd); + x4 = HIGHBD_WRAPLOW(s0 - s4, bd); + x5 = HIGHBD_WRAPLOW(s1 - s5, bd); + x6 = HIGHBD_WRAPLOW(s2 - s6, bd); + x7 = HIGHBD_WRAPLOW(s3 - s7, bd); + x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd); + x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * (tran_high_t)cospi_8_64 + x5 * (tran_high_t)cospi_24_64; + s5 = x4 * (tran_high_t)cospi_24_64 - x5 * (tran_high_t)cospi_8_64; + s6 = -x6 * (tran_high_t)cospi_24_64 + x7 * (tran_high_t)cospi_8_64; + s7 = x6 * (tran_high_t)cospi_8_64 + x7 * (tran_high_t)cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * (tran_high_t)cospi_8_64 + x13 * (tran_high_t)cospi_24_64; + s13 = x12 * (tran_high_t)cospi_24_64 - x13 * (tran_high_t)cospi_8_64; + s14 = -x14 * (tran_high_t)cospi_24_64 + x15 * (tran_high_t)cospi_8_64; + s15 = x14 * (tran_high_t)cospi_8_64 + x15 * (tran_high_t)cospi_24_64; x0 = HIGHBD_WRAPLOW(s0 + s2, bd); x1 = HIGHBD_WRAPLOW(s1 + s3, bd); x2 = HIGHBD_WRAPLOW(s0 - s2, bd); x3 = HIGHBD_WRAPLOW(s1 - s3, bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); + x8 = HIGHBD_WRAPLOW(s8 + s10, bd); + x9 = HIGHBD_WRAPLOW(s9 + s11, bd); + x10 = HIGHBD_WRAPLOW(s8 - s10, bd); + x11 = HIGHBD_WRAPLOW(s9 - s11, bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + 
s15), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd); - // stage 3 - s2 = cospi_16_64 * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (x6 - x7); + // stage 4 + s2 = (tran_high_t)(-cospi_16_64) * (x2 + x3); + s3 = (tran_high_t)cospi_16_64 * (x2 - x3); + s6 = (tran_high_t)cospi_16_64 * (x6 + x7); + s7 = (tran_high_t)cospi_16_64 * (-x6 + x7); + s10 = (tran_high_t)cospi_16_64 * (x10 + x11); + s11 = (tran_high_t)cospi_16_64 * (-x10 + x11); + s14 = (tran_high_t)(-cospi_16_64) * (x14 + x15); + s15 = (tran_high_t)cospi_16_64 * (x14 - x15); - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd); output[0] = HIGHBD_WRAPLOW(x0, bd); - output[1] = HIGHBD_WRAPLOW(-x4, bd); - output[2] = HIGHBD_WRAPLOW(x6, bd); - output[3] = HIGHBD_WRAPLOW(-x2, bd); - output[4] = HIGHBD_WRAPLOW(x3, bd); - output[5] = HIGHBD_WRAPLOW(-x7, bd); - output[6] = HIGHBD_WRAPLOW(x5, bd); - output[7] = HIGHBD_WRAPLOW(-x1, bd); -} - -void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // First transform rows. - // Only first 4 row has non-zero coefs. - for (i = 0; i < 4; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - // Then transform columns. 
- for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } + output[1] = HIGHBD_WRAPLOW(-x8, bd); + output[2] = HIGHBD_WRAPLOW(x12, bd); + output[3] = HIGHBD_WRAPLOW(-x4, bd); + output[4] = HIGHBD_WRAPLOW(x6, bd); + output[5] = HIGHBD_WRAPLOW(x14, bd); + output[6] = HIGHBD_WRAPLOW(x10, bd); + output[7] = HIGHBD_WRAPLOW(x2, bd); + output[8] = HIGHBD_WRAPLOW(x3, bd); + output[9] = HIGHBD_WRAPLOW(x11, bd); + output[10] = HIGHBD_WRAPLOW(x15, bd); + output[11] = HIGHBD_WRAPLOW(x7, bd); + output[12] = HIGHBD_WRAPLOW(x5, bd); + output[13] = HIGHBD_WRAPLOW(-x13, bd); + output[14] = HIGHBD_WRAPLOW(x9, bd); + output[15] = HIGHBD_WRAPLOW(-x1, bd); } void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { @@ -1642,6 +1887,14 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_high_t temp1, temp2; (void)bd; + if (detect_invalid_highbd_input(input, 16)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 16); + return; + } + // stage 1 step1[0] = input[0 / 2]; step1[1] = input[16 / 2]; @@ -1670,25 +1923,33 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[6] = step1[6]; step2[7] = step1[7]; - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64; + temp2 = + step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64; + step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = step1[9] * (tran_high_t)cospi_14_64 - + step1[14] * (tran_high_t)cospi_18_64; + temp2 = step1[9] * (tran_high_t)cospi_18_64 + + step1[14] * (tran_high_t)cospi_14_64; + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = step1[10] * (tran_high_t)cospi_22_64 - + step1[13] * (tran_high_t)cospi_10_64; + temp2 = step1[10] * (tran_high_t)cospi_10_64 + + step1[13] * (tran_high_t)cospi_22_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = step1[11] * (tran_high_t)cospi_6_64 - + step1[12] * 
(tran_high_t)cospi_26_64; + temp2 = step1[11] * (tran_high_t)cospi_26_64 + + step1[12] * (tran_high_t)cospi_6_64; + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); // stage 3 step1[0] = step2[0]; @@ -1696,14 +1957,18 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[2] = step2[2]; step1[3] = step2[3]; - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64; + temp2 = + step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64; + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64; + temp2 = + step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); @@ -1715,14 +1980,16 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64; + temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64; + step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64; + temp2 = + step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64; + step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); @@ -1730,14 +1997,18 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[8] = step1[8]; step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = 
HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = -step1[9] * (tran_high_t)cospi_8_64 + + step1[14] * (tran_high_t)cospi_24_64; + temp2 = + step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64; + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step1[10] * (tran_high_t)cospi_24_64 - + step1[13] * (tran_high_t)cospi_8_64; + temp2 = -step1[10] * (tran_high_t)cospi_8_64 + + step1[13] * (tran_high_t)cospi_24_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[11] = step1[11]; step2[12] = step1[12]; @@ -1747,10 +2018,10 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64; + temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); @@ -1773,14 +2044,14 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); step2[8] = step1[8]; step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64; + temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64; + temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64; + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[14] = step1[14]; step2[15] = step1[15]; @@ -1803,22 +2074,21 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); } -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { + int i, j; tran_low_t out[16 * 16]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[16], temp_out[16]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - // First transform rows. + // First transform rows for (i = 0; i < 16; ++i) { vpx_highbd_idct16_c(input, outptr, bd); input += 16; outptr += 16; } - // Then transform columns. 
+ // Then transform columns for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; vpx_highbd_idct16_c(temp_in, temp_out, bd); @@ -1829,183 +2099,40 @@ void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; - tran_high_t s9, s10, s11, s12, s13, s14, s15; - - tran_low_t x0 = input[15]; - tran_low_t x1 = input[0]; - tran_low_t x2 = input[13]; - tran_low_t x3 = input[2]; - tran_low_t x4 = input[11]; - tran_low_t x5 = input[4]; - tran_low_t x6 = input[9]; - tran_low_t x7 = input[6]; - tran_low_t x8 = input[7]; - tran_low_t x9 = input[8]; - tran_low_t x10 = input[5]; - tran_low_t x11 = input[10]; - tran_low_t x12 = input[3]; - tran_low_t x13 = input[12]; - tran_low_t x14 = input[1]; - tran_low_t x15 = input[14]; - (void)bd; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | - x13 | x14 | x15)) { - memset(output, 0, 16 * sizeof(*output)); - return; - } - - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd); - x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd); - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd); - x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd); - x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd); - x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd); - x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd); - x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd); - x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = HIGHBD_WRAPLOW(s0 + s4, bd); - x1 = HIGHBD_WRAPLOW(s1 + s5, bd); - x2 = HIGHBD_WRAPLOW(s2 + s6, bd); - x3 = HIGHBD_WRAPLOW(s3 + s7, bd); - x4 = HIGHBD_WRAPLOW(s0 - s4, bd); - x5 = HIGHBD_WRAPLOW(s1 - 
s5, bd); - x6 = HIGHBD_WRAPLOW(s2 - s6, bd); - x7 = HIGHBD_WRAPLOW(s3 - s7, bd); - x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd); - x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd); - x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd); - x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd); - x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd); - x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = HIGHBD_WRAPLOW(s0 + s2, bd); - x1 = HIGHBD_WRAPLOW(s1 + s3, bd); - x2 = HIGHBD_WRAPLOW(s0 - s2, bd); - x3 = HIGHBD_WRAPLOW(s1 - s3, bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd); - x8 = HIGHBD_WRAPLOW(s8 + s10, bd); - x9 = HIGHBD_WRAPLOW(s9 + s11, bd); - x10 = HIGHBD_WRAPLOW(s8 - s10, bd); - x11 = HIGHBD_WRAPLOW(s9 - s11, bd); - x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd); - x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd); - - // stage 4 - s2 = (-cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (-x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (-x10 + x11); - s14 = (-cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd); - x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd); - x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd); - - output[0] = HIGHBD_WRAPLOW(x0, bd); - output[1] = HIGHBD_WRAPLOW(-x8, bd); - output[2] = HIGHBD_WRAPLOW(x12, bd); - output[3] = HIGHBD_WRAPLOW(-x4, bd); - output[4] = HIGHBD_WRAPLOW(x6, bd); - output[5] = HIGHBD_WRAPLOW(x14, bd); - output[6] = HIGHBD_WRAPLOW(x10, bd); - output[7] = HIGHBD_WRAPLOW(x2, bd); - output[8] = HIGHBD_WRAPLOW(x3, bd); - output[9] = HIGHBD_WRAPLOW(x11, bd); - output[10] = HIGHBD_WRAPLOW(x15, bd); - output[11] = HIGHBD_WRAPLOW(x7, bd); - output[12] = HIGHBD_WRAPLOW(x5, bd); - output[13] = HIGHBD_WRAPLOW(-x13, bd); - output[14] = HIGHBD_WRAPLOW(x9, bd); - output[15] = HIGHBD_WRAPLOW(-x1, bd); -} - -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { + int i, j; + tran_low_t 
out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 8x8 area, we only need to calculate first 8 rows here. + for (i = 0; i < 8; ++i) { + vpx_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + uint16_t *destT = dest; + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + vpx_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + destT[i] = highbd_clip_pixel_add(destT[i], + ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + destT += stride; + } + } +} + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; tran_low_t out[16 * 16] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[16], temp_out[16]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. @@ -2015,7 +2142,7 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, outptr += 16; } - // Then transform columns. + // Then transform columns for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; vpx_highbd_idct16_c(temp_in, temp_out, bd); @@ -2026,15 +2153,15 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); @@ -2048,6 +2175,14 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, tran_high_t temp1, temp2; (void)bd; + if (detect_invalid_highbd_input(input, 32)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 32); + return; + } + // stage 1 step1[0] = input[0]; step1[1] = input[16]; @@ -2066,45 +2201,61 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[14] = input[14]; step1[15] = input[30]; - temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; - temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + input[1] * (tran_high_t)cospi_31_64 - input[31] * (tran_high_t)cospi_1_64; + temp2 = + input[1] * (tran_high_t)cospi_1_64 + input[31] * (tran_high_t)cospi_31_64; + step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; - temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[30] 
= HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = input[17] * (tran_high_t)cospi_15_64 - + input[15] * (tran_high_t)cospi_17_64; + temp2 = input[17] * (tran_high_t)cospi_17_64 + + input[15] * (tran_high_t)cospi_15_64; + step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; - temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + input[9] * (tran_high_t)cospi_23_64 - input[23] * (tran_high_t)cospi_9_64; + temp2 = + input[9] * (tran_high_t)cospi_9_64 + input[23] * (tran_high_t)cospi_23_64; + step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; - temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + input[25] * (tran_high_t)cospi_7_64 - input[7] * (tran_high_t)cospi_25_64; + temp2 = + input[25] * (tran_high_t)cospi_25_64 + input[7] * (tran_high_t)cospi_7_64; + step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; - temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + input[5] * (tran_high_t)cospi_27_64 - input[27] * (tran_high_t)cospi_5_64; + temp2 = + input[5] * (tran_high_t)cospi_5_64 + input[27] * (tran_high_t)cospi_27_64; + step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; - temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = input[21] * (tran_high_t)cospi_11_64 - + input[11] * (tran_high_t)cospi_21_64; + temp2 = input[21] * (tran_high_t)cospi_21_64 + + input[11] * (tran_high_t)cospi_11_64; + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; - temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = input[13] * (tran_high_t)cospi_19_64 - + input[19] * (tran_high_t)cospi_13_64; + temp2 = input[13] * (tran_high_t)cospi_13_64 + + input[19] * (tran_high_t)cospi_19_64; + step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; - temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + input[29] * (tran_high_t)cospi_3_64 - input[3] * (tran_high_t)cospi_29_64; + temp2 = + 
input[29] * (tran_high_t)cospi_29_64 + input[3] * (tran_high_t)cospi_3_64; + step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); // stage 2 step2[0] = step1[0]; @@ -2116,25 +2267,33 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[6] = step1[6]; step2[7] = step1[7]; - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64; + temp2 = + step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64; + step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = step1[9] * (tran_high_t)cospi_14_64 - + step1[14] * (tran_high_t)cospi_18_64; + temp2 = step1[9] * (tran_high_t)cospi_18_64 + + step1[14] * (tran_high_t)cospi_14_64; + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = step1[10] * (tran_high_t)cospi_22_64 - + step1[13] * (tran_high_t)cospi_10_64; + temp2 = step1[10] * (tran_high_t)cospi_10_64 + + step1[13] * (tran_high_t)cospi_22_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = step1[11] * (tran_high_t)cospi_6_64 - + step1[12] * (tran_high_t)cospi_26_64; + temp2 = step1[11] * (tran_high_t)cospi_26_64 + + step1[12] * (tran_high_t)cospi_6_64; + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd); step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd); @@ -2159,14 +2318,18 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[2] = step2[2]; step1[3] = step2[3]; - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = + step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64; + temp2 = + step2[4] * (tran_high_t)cospi_4_64 + step2[7] * 
(tran_high_t)cospi_28_64; + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64; + temp2 = + step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); @@ -2179,38 +2342,48 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[16] = step2[16]; step1[31] = step2[31]; - temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; - temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; - temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = -step2[17] * (tran_high_t)cospi_4_64 + + step2[30] * (tran_high_t)cospi_28_64; + temp2 = step2[17] * (tran_high_t)cospi_28_64 + + step2[30] * (tran_high_t)cospi_4_64; + step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[18] * (tran_high_t)cospi_28_64 - + step2[29] * (tran_high_t)cospi_4_64; + temp2 = -step2[18] * (tran_high_t)cospi_4_64 + + step2[29] * (tran_high_t)cospi_28_64; + step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[19] = step2[19]; step1[20] = step2[20]; - temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; - temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; - temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = -step2[21] * (tran_high_t)cospi_20_64 + + step2[26] * (tran_high_t)cospi_12_64; + temp2 = step2[21] * (tran_high_t)cospi_12_64 + + step2[26] * (tran_high_t)cospi_20_64; + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[22] * (tran_high_t)cospi_12_64 - + step2[25] * (tran_high_t)cospi_20_64; + temp2 = -step2[22] * (tran_high_t)cospi_20_64 + + step2[25] * (tran_high_t)cospi_12_64; + step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[23] = step2[23]; step1[24] = step2[24]; step1[27] = step2[27]; step1[28] = step2[28]; // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - 
step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64; + temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64; + step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64; + temp2 = + step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64; + step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); @@ -2218,14 +2391,18 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[8] = step1[8]; step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = -step1[9] * (tran_high_t)cospi_8_64 + + step1[14] * (tran_high_t)cospi_24_64; + temp2 = + step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64; + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step1[10] * (tran_high_t)cospi_24_64 - + step1[13] * (tran_high_t)cospi_8_64; + temp2 = -step1[10] * (tran_high_t)cospi_8_64 + + step1[13] * (tran_high_t)cospi_24_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[11] = step1[11]; step2[12] = step1[12]; @@ -2253,10 +2430,10 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64; + temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); @@ -2270,22 +2447,30 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[16] = step2[16]; step1[17] = step2[17]; - temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; - temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; - temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[20] 
* cospi_24_64 - step2[27] * cospi_8_64; - temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; - temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = -step2[18] * (tran_high_t)cospi_8_64 + + step2[29] * (tran_high_t)cospi_24_64; + temp2 = step2[18] * (tran_high_t)cospi_24_64 + + step2[29] * (tran_high_t)cospi_8_64; + step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[19] * (tran_high_t)cospi_8_64 + + step2[28] * (tran_high_t)cospi_24_64; + temp2 = step2[19] * (tran_high_t)cospi_24_64 + + step2[28] * (tran_high_t)cospi_8_64; + step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[20] * (tran_high_t)cospi_24_64 - + step2[27] * (tran_high_t)cospi_8_64; + temp2 = -step2[20] * (tran_high_t)cospi_8_64 + + step2[27] * (tran_high_t)cospi_24_64; + step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[21] * (tran_high_t)cospi_24_64 - + step2[26] * (tran_high_t)cospi_8_64; + temp2 = -step2[21] * (tran_high_t)cospi_8_64 + + step2[26] * (tran_high_t)cospi_24_64; + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[22] = step2[22]; step1[23] = step2[23]; step1[24] = step2[24]; @@ -2304,14 +2489,14 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); step2[8] = step1[8]; step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64; + temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64; + temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64; + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[14] = step1[14]; step2[15] = step1[15]; @@ -2355,22 +2540,22 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[17] = step2[17]; step1[18] = step2[18]; step1[19] = step2[19]; - temp1 = (-step2[20] + step2[27]) * cospi_16_64; - temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step2[21] + step2[26]) * cospi_16_64; - temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = 
HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step2[22] + step2[25]) * cospi_16_64; - temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); - temp1 = (-step2[23] + step2[24]) * cospi_16_64; - temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (-step2[20] + step2[27]) * (tran_high_t)cospi_16_64; + temp2 = (step2[20] + step2[27]) * (tran_high_t)cospi_16_64; + step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step2[21] + step2[26]) * (tran_high_t)cospi_16_64; + temp2 = (step2[21] + step2[26]) * (tran_high_t)cospi_16_64; + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step2[22] + step2[25]) * (tran_high_t)cospi_16_64; + temp2 = (step2[22] + step2[25]) * (tran_high_t)cospi_16_64; + step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step2[23] + step2[24]) * (tran_high_t)cospi_16_64; + temp2 = (step2[23] + step2[24]) * (tran_high_t)cospi_16_64; + step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[28] = step2[28]; step1[29] = step2[29]; step1[30] = step2[30]; @@ -2411,26 +2596,19 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd); } -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { + int i, j; tran_low_t out[32 * 32]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows for (i = 0; i < 32; ++i) { - tran_low_t zero_coeff[16]; - for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; - for (j = 0; j < 8; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 4; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 2; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + tran_low_t zero_coeff = 0; + for (j = 0; j < 32; ++j) zero_coeff |= input[j]; - if (zero_coeff[0] | zero_coeff[1]) + if (zero_coeff) highbd_idct32_c(input, outptr, bd); else memset(outptr, 0, sizeof(tran_low_t) * 32); @@ -2449,21 +2627,49 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows - // Only upper-left 8x8 has non-zero coeff. 
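In the hunks around this point, the full 1024-coefficient path now detects all-zero rows with a single OR over the 32 coefficients of each row (replacing the old pairwise reduction tree), while the new _135 and _34 variants simply stop the row pass after 16 or 8 rows, since only the upper-left 16x16 or 8x8 region of the coefficient block is populated; the column pass is the same in every case. A condensed sketch of that row/column structure follows, with an illustrative helper name and an extra row-count parameter that are not part of the library:

/* Sketch only: row/column split used by the reduced-coefficient 32x32
 * inverse transforms. Transform the first nonzero_rows rows, leave the
 * rest of out[] at its zero initialization, then transform every column
 * and add the rounded result into dest. */
static void highbd_idct32x32_rows_cols_sketch(const tran_low_t *input,
                                              uint16_t *dest, int stride,
                                              int bd, int nonzero_rows) {
  int i, j;
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t temp_in[32], temp_out[32];

  for (i = 0; i < nonzero_rows; ++i) {
    highbd_idct32_c(input + 32 * i, out + 32 * i, bd);
  }

  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    highbd_idct32_c(temp_in, temp_out, bd);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}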
+ // Only upper-left 16x16 has non-zero coeff + for (i = 0; i < 16; ++i) { + highbd_idct32_c(input, outptr, bd); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + uint16_t *destT = dest; + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; + highbd_idct32_c(temp_in, temp_out, bd); + for (j = 0; j < 32; ++j) { + destT[i] = highbd_clip_pixel_add(destT[i], + ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + destT += stride; + } + } +} + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_low_t out[32 * 32] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + // Only upper-left 8x8 has non-zero coeff for (i = 0; i < 8; ++i) { highbd_idct32_c(input, outptr, bd); input += 32; outptr += 32; } + // Columns for (i = 0; i < 32; ++i) { for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; @@ -2475,15 +2681,15 @@ void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; int a1; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); - tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { @@ -2491,4 +2697,5 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, dest += stride; } } + #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libs/libvpx/vpx_dsp/inv_txfm.h b/libs/libvpx/vpx_dsp/inv_txfm.h index e530730d57..13137659fa 100644 --- a/libs/libvpx/vpx_dsp/inv_txfm.h +++ b/libs/libvpx/vpx_dsp/inv_txfm.h @@ -57,11 +57,6 @@ static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) { (void)bd; return input; } - -static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) { - tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - return (tran_high_t)rv; -} #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_EMULATE_HARDWARE diff --git a/libs/libvpx/vpx_dsp/loopfilter.c b/libs/libvpx/vpx_dsp/loopfilter.c index 2f7eff8b05..9866ea37d6 100644 --- a/libs/libvpx/vpx_dsp/loopfilter.c +++ b/libs/libvpx/vpx_dsp/loopfilter.c @@ -94,8 +94,8 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; // save bottom 3 bits so that we round one side +4 and the other +3 - // if it equals 4 we'll set to adjust by -1 to account for the fact - // we'd round 3 the other way + // if it equals 4 we'll set it to adjust by -1 to account for the fact + // we'd round it by 3 the other way filter1 = signed_char_clamp(filter + 4) >> 3; filter2 = signed_char_clamp(filter + 3) >> 3; @@ -308,12 +308,12 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, } } -void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { +void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); } -void vpx_lpf_horizontal_edge_16_c(uint8_t *s, 
int p, const uint8_t *blimit, +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); } @@ -425,8 +425,8 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask; // Save bottom 3 bits so that we round one side +4 and the other +3 - // if it equals 4 we'll set to adjust by -1 to account for the fact - // we'd round 3 the other way. + // if it equals 4 we'll set it to adjust by -1 to account for the fact + // we'd round it by 3 the other way. filter1 = signed_char_clamp_high(filter + 4, bd) >> 3; filter2 = signed_char_clamp_high(filter + 3, bd) >> 3; @@ -673,14 +673,13 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, } } -void vpx_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int bd) { +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); } -void vpx_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p, +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { diff --git a/libs/libvpx/vpx_dsp/mips/add_noise_msa.c b/libs/libvpx/vpx_dsp/mips/add_noise_msa.c index e372b9d8c9..43d2c1146e 100644 --- a/libs/libvpx/vpx_dsp/mips/add_noise_msa.c +++ b/libs/libvpx/vpx_dsp/mips/add_noise_msa.c @@ -14,7 +14,14 @@ void vpx_plane_add_noise_msa(uint8_t *start_ptr, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int32_t pitch) { - uint32_t i, j; + int i, j; + v16u8 pos0, pos1, ref0, ref1; + v16i8 black_clamp, white_clamp, both_clamp; + + black_clamp = __msa_fill_b(blackclamp); + white_clamp = __msa_fill_b(whiteclamp); + both_clamp = black_clamp + white_clamp; + both_clamp = -both_clamp; for (i = 0; i < height / 2; ++i) { uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch; @@ -22,29 +29,16 @@ void vpx_plane_add_noise_msa(uint8_t *start_ptr, const int8_t *noise, uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch; const int8_t *ref1_ptr = noise + (rand() & 0xff); for (j = width / 16; j--;) { - v16i8 temp00_s, temp01_s; - v16u8 temp00, temp01, black_clamp, white_clamp; - v16u8 pos0, ref0, pos1, ref1; - v16i8 const127 = __msa_ldi_b(127); - pos0 = LD_UB(pos0_ptr); ref0 = LD_UB(ref0_ptr); pos1 = LD_UB(pos1_ptr); ref1 = LD_UB(ref1_ptr); - black_clamp = (v16u8)__msa_fill_b(blackclamp); - white_clamp = (v16u8)__msa_fill_b(whiteclamp); - temp00 = (pos0 < black_clamp); - pos0 = __msa_bmnz_v(pos0, black_clamp, temp00); - temp01 = (pos1 < black_clamp); - pos1 = __msa_bmnz_v(pos1, black_clamp, temp01); - XORI_B2_128_UB(pos0, pos1); - temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127); - temp00 = (v16u8)(temp00_s < pos0); - pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00); - temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127); - temp01 = (temp01_s < pos1); - pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01); - XORI_B2_128_UB(pos0, pos1); + pos0 = __msa_subsus_u_b(pos0, black_clamp); + pos1 = __msa_subsus_u_b(pos1, black_clamp); + pos0 = __msa_subsus_u_b(pos0, both_clamp); + pos1 = __msa_subsus_u_b(pos1, both_clamp); + pos0 = __msa_subsus_u_b(pos0, white_clamp); + pos1 = __msa_subsus_u_b(pos1, white_clamp); pos0 += 
ref0; ST_UB(pos0, pos0_ptr); pos1 += ref1; diff --git a/libs/libvpx/vpx_dsp/mips/avg_msa.c b/libs/libvpx/vpx_dsp/mips/avg_msa.c index 52a24ed379..d0ac7b8e29 100644 --- a/libs/libvpx/vpx_dsp/mips/avg_msa.c +++ b/libs/libvpx/vpx_dsp/mips/avg_msa.c @@ -7,6 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" @@ -54,3 +55,674 @@ uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { return sum_out; } + +void vpx_hadamard_8x8_msa(const int16_t *src, ptrdiff_t src_stride, + int16_t *dst) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8); +} + +void vpx_hadamard_16x16_msa(const int16_t *src, ptrdiff_t src_stride, + int16_t *dst) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + + LD_SH2(src, 8, src0, src8); + src += src_stride; + LD_SH2(src, 8, src1, src9); + src += src_stride; + LD_SH2(src, 8, src2, src10); + src += src_stride; + LD_SH2(src, 8, src3, src11); + src += src_stride; + LD_SH2(src, 8, src4, src12); + src += src_stride; + LD_SH2(src, 8, src5, src13); + src += src_stride; + LD_SH2(src, 8, src6, src14); + src += src_stride; + LD_SH2(src, 8, src7, src15); + src += src_stride; + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + 
TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src11, src4, src5, src6, src7); + ST_SH8(src0, src1, src2, src11, src4, src5, src6, src7, dst, 8); + + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8, + src9, src10, src11, src12, src13, src14, src15); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0, + res1, res2, res3, res4, res5, res6, res7); + + LD_SH2(src, 8, src0, src8); + src += src_stride; + LD_SH2(src, 8, src1, src9); + src += src_stride; + LD_SH2(src, 8, src2, src10); + src += src_stride; + LD_SH2(src, 8, src3, src11); + src += src_stride; + + ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 64, 8); + + LD_SH2(src, 8, src4, src12); + src += src_stride; + LD_SH2(src, 8, src5, src13); + src += src_stride; + LD_SH2(src, 8, src6, src14); + src += src_stride; + LD_SH2(src, 8, src7, src15); + src += src_stride; + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 2 * 64, 8); + + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8, + src9, src10, src11, src12, src13, src14, src15); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0, + res1, res2, res3, res4, 
res5, res6, res7); + ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 3 * 64, 8); + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); + dst += 16; + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); + dst += 16; + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); + dst += 16; + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); +} + +int vpx_satd_msa(const int16_t *data, int length) { + int i, satd; + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 src8, src9, src10, src11, src12, src13, src14, src15; + v8i16 zero = { 0 }; + v8u16 tmp0_h, tmp1_h, tmp2_h, tmp3_h, tmp4_h, tmp5_h, tmp6_h, tmp7_h; + v4u32 tmp0_w = { 0 }; + + if (16 == length) { + LD_SH2(data, 8, src0, src1); + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + satd = HADD_UW_U32(tmp0_w); + } else if (64 == length) { + LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7); + + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src2, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src3, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src4, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src5, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src6, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src7, zero); + + tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + + satd = HADD_UW_U32(tmp0_w); + } else if (256 == length) { + for (i = 0; i < 2; 
++i) { + LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7); + data += 8 * 8; + LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15); + data += 8 * 8; + + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src2, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src3, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src4, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src5, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src6, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src7, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + + tmp0_h = (v8u16)__msa_asub_s_h(src8, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src9, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src10, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src11, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src12, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src13, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src14, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src15, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + } + + satd = HADD_UW_U32(tmp0_w); + } else if (1024 == length) { + for (i = 0; i < 8; ++i) { + LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7); + data += 8 * 8; + LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15); + data += 8 * 8; + + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src2, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src3, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src4, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src5, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src6, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src7, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + + tmp0_h = (v8u16)__msa_asub_s_h(src8, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src9, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src10, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src11, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src12, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src13, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src14, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src15, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + } + + satd = HADD_UW_U32(tmp0_w); + } else { + satd = 0; + + for (i = 0; i < length; ++i) { + satd += abs(data[i]); + } + } + + return satd; +} + +void vpx_int_pro_row_msa(int16_t hbuf[16], 
const uint8_t *ref, + const int ref_stride, const int height) { + int i; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v8i16 hbuf_r = { 0 }; + v8i16 hbuf_l = { 0 }; + v8i16 ref0_r, ref0_l, ref1_r, ref1_l, ref2_r, ref2_l, ref3_r, ref3_l; + v8i16 ref4_r, ref4_l, ref5_r, ref5_l, ref6_r, ref6_l, ref7_r, ref7_l; + + if (16 == height) { + for (i = 2; i--;) { + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + } + + SRA_2V(hbuf_r, hbuf_l, 3); + ST_SH2(hbuf_r, hbuf_l, hbuf, 8); + } else if (32 == height) { + for (i = 2; i--;) { + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + } + + SRA_2V(hbuf_r, hbuf_l, 4); + ST_SH2(hbuf_r, hbuf_l, hbuf, 8); + } else if (64 == height) { + for (i = 4; i--;) { + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, 
hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + } + + SRA_2V(hbuf_r, hbuf_l, 5); + ST_SH2(hbuf_r, hbuf_l, hbuf, 8); + } else { + const int norm_factor = height >> 1; + int cnt; + + for (cnt = 0; cnt < 16; cnt++) { + hbuf[cnt] = 0; + } + + for (i = 0; i < height; ++i) { + for (cnt = 0; cnt < 16; cnt++) { + hbuf[cnt] += ref[cnt]; + } + + ref += ref_stride; + } + + for (cnt = 0; cnt < 16; cnt++) { + hbuf[cnt] /= norm_factor; + } + } +} + +int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width) { + int16_t sum; + v16u8 ref0, ref1, ref2, ref3; + v8u16 ref0_h; + + if (16 == width) { + ref0 = LD_UB(ref); + ref0_h = __msa_hadd_u_h(ref0, ref0); + sum = HADD_UH_U32(ref0_h); + } else if (32 == width) { + LD_UB2(ref, 16, ref0, ref1); + ref0_h = __msa_hadd_u_h(ref0, ref0); + ref0_h += __msa_hadd_u_h(ref1, ref1); + sum = HADD_UH_U32(ref0_h); + } else if (64 == width) { + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref0_h = __msa_hadd_u_h(ref0, ref0); + ref0_h += __msa_hadd_u_h(ref1, ref1); + ref0_h += __msa_hadd_u_h(ref2, ref2); + ref0_h += __msa_hadd_u_h(ref3, ref3); + sum = HADD_UH_U32(ref0_h); + } else { + int idx; + + sum = 0; + for (idx = 0; idx < width; ++idx) { + sum += ref[idx]; + } + } + + return sum; +} + +int vpx_vector_var_msa(const int16_t *ref, const int16_t *src, const int bwl) { + int sse, mean, var; + v8i16 src0, src1, src2, src3, src4, src5, src6, src7, ref0, ref1, ref2; + v8i16 ref3, ref4, ref5, ref6, ref7, src_l0_m, src_l1_m, src_l2_m, src_l3_m; + v8i16 src_l4_m, src_l5_m, src_l6_m, src_l7_m; + v4i32 res_l0_m, res_l1_m, res_l2_m, res_l3_m, res_l4_m, res_l5_m, res_l6_m; + v4i32 res_l7_m, mean_v; + v2i64 sse_v; + + if (2 == bwl) { + LD_SH2(src, 8, src0, src1); + LD_SH2(ref, 8, ref0, ref1); + + ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); + ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m); + sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + mean_v = res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + + sse_v += __msa_splati_d(sse_v, 1); + sse = __msa_copy_s_w((v4i32)sse_v, 0); + + mean = HADD_SW_S32(mean_v); + } else if (3 == bwl) { + LD_SH4(src, 8, src0, src1, src2, src3); + LD_SH4(ref, 8, ref0, ref1, ref2, ref3); + + ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); + ILVRL_H2_SH(src1, 
ref1, src_l2_m, src_l3_m); + ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m); + ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m); + HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m); + sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m); + sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v); + DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v); + mean_v = res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + mean_v += res_l4_m + res_l5_m; + mean_v += res_l6_m + res_l7_m; + + sse_v += __msa_splati_d(sse_v, 1); + sse = __msa_copy_s_w((v4i32)sse_v, 0); + + mean = HADD_SW_S32(mean_v); + } else if (4 == bwl) { + LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7); + LD_SH8(ref, 8, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + + ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); + ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m); + ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m); + ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m); + HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m); + sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m); + sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v); + DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v); + mean_v = res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + mean_v += res_l4_m + res_l5_m; + mean_v += res_l6_m + res_l7_m; + + ILVRL_H2_SH(src4, ref4, src_l0_m, src_l1_m); + ILVRL_H2_SH(src5, ref5, src_l2_m, src_l3_m); + ILVRL_H2_SH(src6, ref6, src_l4_m, src_l5_m); + ILVRL_H2_SH(src7, ref7, src_l6_m, src_l7_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m); + HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m); + DPADD_SD2_SD(res_l0_m, res_l1_m, sse_v, sse_v); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v); + DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v); + mean_v += res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + mean_v += res_l4_m + res_l5_m; + mean_v += res_l6_m + res_l7_m; + + sse_v += __msa_splati_d(sse_v, 1); + sse = __msa_copy_s_w((v4i32)sse_v, 0); + + mean = HADD_SW_S32(mean_v); + } else { + int i; + const int width = 4 << bwl; + + sse = 0; + mean = 0; + + for (i = 0; i < width; ++i) { + const int diff = ref[i] - src[i]; + + mean += diff; + sse += diff * diff; + } + } + + var = sse - ((mean * mean) >> (bwl + 2)); + + return var; +} + +void vpx_minmax_8x8_msa(const uint8_t *s, int p, const uint8_t *d, int dp, + int *min, int *max) { + v16u8 s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7; + v16u8 diff0, diff1, diff2, diff3, min0, min1, max0, max1; + + LD_UB8(s, p, s0, s1, s2, s3, s4, s5, s6, s7); + LD_UB8(d, dp, d0, d1, d2, d3, d4, d5, d6, d7); + PCKEV_D4_UB(s1, s0, s3, s2, s5, s4, s7, s6, s0, s1, s2, s3); + PCKEV_D4_UB(d1, d0, d3, d2, d5, d4, d7, d6, d0, d1, d2, d3); + + diff0 = __msa_asub_u_b(s0, d0); + diff1 = __msa_asub_u_b(s1, d1); + diff2 = __msa_asub_u_b(s2, d2); + diff3 = __msa_asub_u_b(s3, d3); + + min0 = __msa_min_u_b(diff0, diff1); + min1 = 
__msa_min_u_b(diff2, diff3); + min0 = __msa_min_u_b(min0, min1); + + max0 = __msa_max_u_b(diff0, diff1); + max1 = __msa_max_u_b(diff2, diff3); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 8); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 8); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 4); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 4); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 2); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 2); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 1); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 1); + max0 = __msa_max_u_b(max0, max1); + + *min = min0[0]; + *max = max0[0]; +} diff --git a/libs/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c index ae88eddfd6..18e7d5375d 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c @@ -219,9 +219,10 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; uint32_t pos = 38; assert(y_step_q4 == 16); @@ -247,8 +248,8 @@ void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c index e944207b6e..7dcb662d7f 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c @@ -751,9 +751,10 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; uint32_t pos = 38; assert(x_step_q4 == 16); @@ -793,8 +794,8 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c index 5cc06b5f26..9e65a8f50f 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c @@ -628,9 +628,10 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, void 
vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; uint32_t pos = 38; assert(x_step_q4 == 16); @@ -672,8 +673,8 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filter_x, (int32_t)h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c index eb1975e447..a3e967b405 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c @@ -201,9 +201,10 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride, void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; uint32_t pos = 38; assert(y_step_q4 == 16); @@ -228,8 +229,8 @@ void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c index 31812299c3..d9c2bef69e 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c @@ -334,15 +334,16 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride, void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); if (((const int32_t *)filter_y)[0] == 0) { - vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -367,8 +368,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } @@ -376,8 +377,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_dspr2(const 
uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { /* Fixed size intermediate buffer places limits on parameters. */ DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); @@ -390,21 +391,26 @@ void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, if (intermediate_height < h) intermediate_height = h; - vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x, - x_step_q4, filter_y, y_step_q4, w, intermediate_height); + vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, + intermediate_height); - vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { int x, y; - uint32_t tp1, tp2, tn1; - uint32_t tp3, tp4, tn2; + uint32_t tp1, tp2, tn1, tp3, tp4, tn2; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; /* prefetch data to cache memory */ prefetch_load(src); diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c index 9a9bab25a5..fb68ad8813 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c @@ -938,15 +938,16 @@ static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; assert(x_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); if (((const int32_t *)filter_x)[0] == 0) { - vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -987,9 +988,8 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, - h); + vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c index f6812c7d04..89f0f41962 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c @@ -1296,9 +1296,11 @@ void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, } void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int 
y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; uint32_t pos = 38; @@ -1307,6 +1309,7 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, assert(y_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); assert(((const int32_t *)filter_y)[1] != 0x800000); + (void)x_step_q4; /* bit positon for extract from acc */ __asm__ __volatile__("wrdsp %[pos], 1 \n\t" @@ -1394,10 +1397,15 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { int x, y; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; /* prefetch data to cache memory */ prefetch_load(src); diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c index 196a0a2f0b..77e95c8444 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c @@ -818,15 +818,16 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride, void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; assert(x_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); if (((const int32_t *)filter_x)[0] == 0) { - vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -868,8 +869,8 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filter_x, (int32_t)h); break; default: - vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c index ad107d5c47..c329f71ccf 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c @@ -318,15 +318,16 @@ static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride, void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); if (((const int32_t 
*)filter_y)[0] == 0) { - vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -349,8 +350,8 @@ void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h b/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h index 4eee3bd5e1..48e440d73c 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h +++ b/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h @@ -24,21 +24,21 @@ extern "C" { #if HAVE_DSPR2 void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h); void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter, int w, @@ -46,9 +46,9 @@ void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); #endif // #if HAVE_DSPR2 #ifdef __cplusplus diff --git a/libs/libvpx/vpx_dsp/mips/deblock_msa.c b/libs/libvpx/vpx_dsp/mips/deblock_msa.c index 402d7ed997..aafa272fbd 100644 --- a/libs/libvpx/vpx_dsp/mips/deblock_msa.c +++ b/libs/libvpx/vpx_dsp/mips/deblock_msa.c @@ -9,6 +9,7 @@ */ #include + #include "./macros_msa.h" extern const int16_t vpx_rv[]; @@ -295,6 +296,7 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, uint8_t *p_dst_st = dst_ptr; uint8_t *f_orig = f; uint16_t col; + uint64_t out0, out1, out2, out3; v16u8 above2, above1, below2, below1; v16u8 src, ref, ref_temp; v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6; @@ -346,6 +348,67 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, f += 16; } + if (0 != (cols / 16)) { + ref = LD_UB(f); + LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1); + src = LD_UB(p_src); + LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0); + above2 = LD_UB(p_src + 3 * 
src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1); + above1 = LD_UB(p_src + 4 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2); + src = LD_UB(p_src + 5 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3); + below1 = LD_UB(p_src + 6 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4); + below2 = LD_UB(p_src + 7 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5); + above2 = LD_UB(p_src + 8 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6); + above1 = LD_UB(p_src + 9 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7); + src = LD_UB(p_src + 10 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8); + below1 = LD_UB(p_src + 11 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9); + below2 = LD_UB(p_src + 12 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10); + above2 = LD_UB(p_src + 13 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11); + above1 = LD_UB(p_src + 14 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12); + src = LD_UB(p_src + 15 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13); + below1 = LD_UB(p_src + 16 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14); + below2 = LD_UB(p_src + 17 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15); + out0 = __msa_copy_u_d((v2i64)inter0, 0); + out1 = __msa_copy_u_d((v2i64)inter1, 0); + out2 = __msa_copy_u_d((v2i64)inter2, 0); + out3 = __msa_copy_u_d((v2i64)inter3, 0); + SD4(out0, out1, out2, out3, p_dst, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter4, 0); + out1 = __msa_copy_u_d((v2i64)inter5, 0); + out2 = __msa_copy_u_d((v2i64)inter6, 0); + out3 = __msa_copy_u_d((v2i64)inter7, 0); + SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter8, 0); + out1 = __msa_copy_u_d((v2i64)inter9, 0); + out2 = __msa_copy_u_d((v2i64)inter10, 0); + out3 = __msa_copy_u_d((v2i64)inter11, 0); + SD4(out0, out1, out2, out3, p_dst + 8 * dst_stride, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter12, 0); + out1 = __msa_copy_u_d((v2i64)inter13, 0); + out2 = __msa_copy_u_d((v2i64)inter14, 0); + out3 = __msa_copy_u_d((v2i64)inter15, 0); + SD4(out0, out1, out2, out3, p_dst + 12 * dst_stride, dst_stride); + } + f = f_orig; p_dst = dst_ptr - 2; LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5, @@ -454,12 +517,12 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, v16u8 tmp = { 0 }; v16i8 zero = { 0 }; v8u16 sum_h, src_r_h, src_l_h; - v4u32 src_r_w, src_l_w; + v4u32 src_r_w; v4i32 flimit_vec; flimit_vec = __msa_fill_w(flimit); for (row = rows; row--;) { - int32_t sum_sq = 0; + int32_t sum_sq; int32_t sum = 0; src0 = (v16u8)__msa_fill_b(src_dup[0]); ST8x1_UB(src0, (src_dup - 8)); @@ -473,9 +536,8 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, src[15] = 0; ILVRL_B2_UH(zero, src, src_r_h, src_l_h); src_r_w = __msa_dotp_u_w(src_r_h, src_r_h); - src_l_w = __msa_dotp_u_w(src_l_h, src_l_h); - sum_sq = HADD_SW_S32(src_r_w); - sum_sq += HADD_SW_S32(src_l_w); + src_r_w += __msa_dotp_u_w(src_l_h, src_l_h); + sum_sq = HADD_SW_S32(src_r_w) + 16; sum_h = 
__msa_hadd_u_h(src, src); sum = HADD_UH_U32(sum_h); { @@ -574,7 +636,6 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, int32_t cols, int32_t flimit) { int32_t row, col, cnt, i; - const int16_t *rv3 = &vpx_rv[63 & rand()]; v4i32 flimit_vec; v16u8 dst7, dst8, dst_r_b, dst_l_b; v16i8 mask; @@ -602,7 +663,7 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, dst = LD_UB(dst_tmp); for (cnt = (col << 4), i = 0; i < 16; ++cnt) { - rv2[i] = rv3 + ((cnt * 17) & 127); + rv2[i] = vpx_rv + (i & 7); ++i; } for (cnt = -8; cnt < 0; ++cnt) { diff --git a/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c b/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c index e41a904808..06fdc951e7 100644 --- a/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c +++ b/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c @@ -927,21 +927,21 @@ void vpx_fdct32x32_rd_msa(const int16_t *input, int16_t *out, } void vpx_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) { - int sum = LD_HADD(input, stride); - sum += LD_HADD(input + 8, stride); - sum += LD_HADD(input + 16, stride); - sum += LD_HADD(input + 24, stride); - sum += LD_HADD(input + 32 * 8, stride); - sum += LD_HADD(input + 32 * 8 + 8, stride); - sum += LD_HADD(input + 32 * 8 + 16, stride); - sum += LD_HADD(input + 32 * 8 + 24, stride); - sum += LD_HADD(input + 32 * 16, stride); - sum += LD_HADD(input + 32 * 16 + 8, stride); - sum += LD_HADD(input + 32 * 16 + 16, stride); - sum += LD_HADD(input + 32 * 16 + 24, stride); - sum += LD_HADD(input + 32 * 24, stride); - sum += LD_HADD(input + 32 * 24 + 8, stride); - sum += LD_HADD(input + 32 * 24 + 16, stride); - sum += LD_HADD(input + 32 * 24 + 24, stride); + int sum, i; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v4i32 vec_w = { 0 }; + + for (i = 0; i < 16; ++i) { + LD_SH4(input, 8, in0, in1, in2, in3); + input += stride; + LD_SH4(input, 8, in4, in5, in6, in7); + input += stride; + ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); + ADD2(in0, in2, in4, in6, in0, in4); + vec_w += __msa_hadd_s_w(in0, in0); + vec_w += __msa_hadd_s_w(in4, in4); + } + + sum = HADD_SW_S32(vec_w); out[0] = (int16_t)(sum >> 3); } diff --git a/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.c b/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.c index fdead50503..5a6dfcef2f 100644 --- a/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.c +++ b/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.c @@ -8,8 +8,23 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/fwd_txfm_msa.h" +void vpx_fdct8x8_1_msa(const int16_t *input, tran_low_t *out, int32_t stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v4i32 vec_w; + + LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); + ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); + ADD2(in0, in2, in4, in6, in0, in4); + vec_w = __msa_hadd_s_w(in0, in0); + vec_w += __msa_hadd_s_w(in4, in4); + out[0] = HADD_SW_S32(vec_w); + out[1] = 0; +} + +#if !CONFIG_VP9_HIGHBITDEPTH void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride) { v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; @@ -215,11 +230,6 @@ void vpx_fdct8x8_msa(const int16_t *input, int16_t *output, ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); } -void vpx_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) { - out[0] = LD_HADD(input, stride); - out[1] = 0; -} - void vpx_fdct16x16_msa(const int16_t *input, int16_t *output, int32_t src_stride) { int32_t i; @@ -237,9 +247,26 @@ void vpx_fdct16x16_msa(const int16_t *input, int16_t *output, } void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) { - int sum = LD_HADD(input, stride); - sum += LD_HADD(input + 8, stride); - sum += LD_HADD(input + 16 * 8, stride); - sum += LD_HADD(input + 16 * 8 + 8, stride); + int sum, i; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v4i32 vec_w = { 0 }; + + for (i = 0; i < 4; ++i) { + LD_SH2(input, 8, in0, in1); + input += stride; + LD_SH2(input, 8, in2, in3); + input += stride; + LD_SH2(input, 8, in4, in5); + input += stride; + LD_SH2(input, 8, in6, in7); + input += stride; + ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); + ADD2(in0, in2, in4, in6, in0, in4); + vec_w += __msa_hadd_s_w(in0, in0); + vec_w += __msa_hadd_s_w(in4, in4); + } + + sum = HADD_SW_S32(vec_w); out[0] = (int16_t)(sum >> 1); } +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h b/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h index db5e90e7b9..fd589224d3 100644 --- a/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h +++ b/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h @@ -14,22 +14,6 @@ #include "vpx_dsp/mips/txfm_macros_msa.h" #include "vpx_dsp/txfm_common.h" -#define LD_HADD(psrc, stride) \ - ({ \ - v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \ - v4i32 vec_w_m; \ - \ - LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \ - ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \ - LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \ - ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \ - in0_m, in4_m); \ - in0_m += in4_m; \ - \ - vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \ - HADD_SW_S32(vec_w_m); \ - }) - #define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ { \ v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ diff --git a/libs/libvpx/vpx_dsp/mips/intrapred16_dspr2.c b/libs/libvpx/vpx_dsp/mips/intrapred16_dspr2.c index 3e29d0ac39..835e10e125 100644 --- a/libs/libvpx/vpx_dsp/mips/intrapred16_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/intrapred16_dspr2.c @@ -15,6 +15,7 @@ void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + (void)above; __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" diff --git a/libs/libvpx/vpx_dsp/mips/intrapred4_dspr2.c 
b/libs/libvpx/vpx_dsp/mips/intrapred4_dspr2.c index 9f51d50c75..dce03a2b2a 100644 --- a/libs/libvpx/vpx_dsp/mips/intrapred4_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/intrapred4_dspr2.c @@ -14,6 +14,7 @@ void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int32_t tmp1, tmp2, tmp3, tmp4; + (void)above; __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" diff --git a/libs/libvpx/vpx_dsp/mips/intrapred8_dspr2.c b/libs/libvpx/vpx_dsp/mips/intrapred8_dspr2.c index eac79d5100..16e7fc5507 100644 --- a/libs/libvpx/vpx_dsp/mips/intrapred8_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/intrapred8_dspr2.c @@ -14,6 +14,7 @@ void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + (void)above; __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" diff --git a/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h b/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h index edd54aec5e..27881f0db6 100644 --- a/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h +++ b/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h @@ -57,18 +57,15 @@ extern "C" { out; \ }) -void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output); -void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); void iadst4_dspr2(const int16_t *input, int16_t *output); void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); -void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); void iadst8_dspr2(const int16_t *input, int16_t *output); void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); -void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride); +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); void iadst16_dspr2(const int16_t *input, int16_t *output); #endif // #if HAVE_DSPR2 diff --git a/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h b/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h index 8b8a4173d2..1fe9b28e8a 100644 --- a/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h +++ b/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h @@ -196,18 +196,18 @@ out2, out3) \ { \ v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd; \ \ ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \ - cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \ + cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ + SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out0, out1); \ DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \ - cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \ + cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ + SRARI_W4_SW(tmp0_madd, tmp1_madd, 
tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3); \ } /* idct 8x8 macro */ diff --git a/libs/libvpx/vpx_dsp/mips/itrans16_dspr2.c b/libs/libvpx/vpx_dsp/mips/itrans16_dspr2.c index 0ec0c2059f..44ba65c7ac 100644 --- a/libs/libvpx/vpx_dsp/mips/itrans16_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/itrans16_dspr2.c @@ -389,7 +389,7 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, } } -void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) { int i; int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int step1_8, step1_9, step1_10, step1_11; @@ -712,14 +712,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "add %[load6], %[step1_1], %[step1_6] \n\t" "add %[load6], %[load6], %[step1_14] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "add %[load5], %[step1_2], %[step1_5] \n\t" @@ -731,14 +731,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "add %[load6], %[step1_3], %[step1_4] \n\t" "add %[load6], %[load6], %[step1_12] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "sub %[load5], %[step1_3], %[step1_4] \n\t" @@ -750,14 +750,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "sub %[load6], %[step1_2], %[step1_5] \n\t" "add %[load6], %[load6], %[step1_10] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "sub %[load5], %[step1_1], %[step1_6] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" @@ -769,14 +769,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "sub %[load6], %[step1_0], %[step1_7] \n\t" "add %[load6], %[load6], %[step1_8] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu 
%[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "sub %[load5], %[step1_0], %[step1_7] \n\t" @@ -788,14 +788,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "sub %[load6], %[step1_1], %[step1_6] \n\t" "sub %[load6], %[load6], %[step1_9] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "sub %[load5], %[step1_2], %[step1_5] \n\t" @@ -807,14 +807,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "sub %[load6], %[step1_3], %[step1_4] \n\t" "sub %[load6], %[load6], %[step1_11] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "add %[load5], %[step1_3], %[step1_4] \n\t" @@ -826,14 +826,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "add %[load6], %[step1_2], %[step1_5] \n\t" "sub %[load6], %[load6], %[step1_13] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "add %[load5], %[step1_1], %[step1_6] \n\t" @@ -845,7 +845,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "add %[load6], %[step1_0], %[step1_7] \n\t" "sub %[load6], %[load6], %[step1_15] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" @@ -856,7 +856,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix) : - [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), + [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9), @@ -869,7 +869,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { } void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { + int stride) { DECLARE_ALIGNED(32, 
int16_t, out[16 * 16]); uint32_t pos = 45; @@ -880,11 +880,11 @@ void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, idct16_rows_dspr2(input, out, 16); // Then transform columns and add to dest - idct16_cols_add_blk_dspr2(out, dest, dest_stride); + idct16_cols_add_blk_dspr2(out, dest, stride); } void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { + int stride) { DECLARE_ALIGNED(32, int16_t, out[16 * 16]); int16_t *outptr = out; uint32_t i; @@ -924,11 +924,11 @@ void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, } // Then transform columns - idct16_cols_add_blk_dspr2(out, dest, dest_stride); + idct16_cols_add_blk_dspr2(out, dest, stride); } void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { + int stride) { uint32_t pos = 45; int32_t out; int32_t r; @@ -975,13 +975,54 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, "sw %[vector_2], 4(%[dest]) \n\t" "sw %[vector_3], 8(%[dest]) \n\t" "sw %[vector_4], 12(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are four byte aligned */ + a11 = a1 >> 1; + a12 = a1 - a11; + __asm__ __volatile__( + "replv.qb %[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 16; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t" + "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); } } else { /* use quad-byte @@ -1005,13 +1046,13 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, "sw %[vector_2], 4(%[dest]) \n\t" "sw %[vector_3], 8(%[dest]) \n\t" "sw %[vector_4], 12(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] 
"r"(stride), [vector_a1] "r"(vector_a1)); } } } diff --git a/libs/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c b/libs/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c index ce25d55c9c..3f043b48ba 100644 --- a/libs/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c @@ -13,26 +13,25 @@ #include "vpx_dsp/txfm_common.h" #if HAVE_DSPR2 -void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { - int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; - int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; - int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; - int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26; - int16_t step1_27, step1_28, step1_29, step1_30, step1_31; - int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; - int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; - int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; - int16_t step2_28, step2_29, step2_30, step2_31; - int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; - int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; - int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27; - int16_t step3_28, step3_29, step3_30, step3_31; +void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; + int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; + int step1_28, step1_29, step1_30, step1_31; + int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int step2_28, step2_29, step2_30, step2_31; + int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; + int step3_29, step3_30, step3_31; int temp0, temp1, temp2, temp3; int load1, load2, load3, load4; int result1, result2; - int i, temp21; + int i; uint8_t *dest_pix, *dest_pix1; const int const_2_power_13 = 8192; uint8_t *cm = vpx_ff_cropTbl; @@ -49,7 +48,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, for (i = 0; i < 32; ++i) { dest_pix = dest + i; - dest_pix1 = dest + i + 31 * dest_stride; + dest_pix1 = dest + i + 31 * stride; __asm__ __volatile__( "lh %[load1], 2(%[input]) \n\t" @@ -103,9 +102,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), - [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), - [step1_31] "=r"(step1_31) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17), + [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 
[cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), @@ -163,9 +162,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), - [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), - [step1_29] "=r"(step1_29) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19), + [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), @@ -223,9 +222,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), - [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), - [step1_27] "=r"(step1_27) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), @@ -279,9 +278,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), - [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), - [step1_25] "=r"(step1_25) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), @@ -335,9 +334,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), - [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), - [step2_15] "=r"(step2_15) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8), + [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14), + [step2_15] "=&r"(step2_15) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), @@ -391,9 +390,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), - [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), - [step2_13] "=r"(step2_13) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11), + [step2_12] "=&r"(step2_12), [step2_13] 
"=&r"(step2_13) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), @@ -434,116 +433,154 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "extp %[step3_11], $ac2, 31 \n\t" "extp %[step3_12], $ac3, 31 \n\t" - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), - [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), - [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), - [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), - [step3_15] "=r"(step3_15) + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8), + [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10), + [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12), + [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14), + [step3_15] "=&r"(step3_15) : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); - step2_18 = step1_17 - step1_18; - step2_29 = step1_30 - step1_29; - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" - "extp %[step3_18], $ac0, 31 \n\t" - - : [step3_18] "=r"(step3_18) - : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), - [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; - step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_19 = step1_16 - step1_19; - step2_28 = step1_31 - step1_28; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" - "extp %[step3_19], $ac0, 31 \n\t" - - : [step3_19] "=r"(step3_19) - : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), - [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; - step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_16 = step1_16 + step1_19; - step3_17 = step1_17 + step1_18; - step3_30 = step1_29 + step1_30; - step3_31 = step1_28 + step1_31; - - step2_20 = step1_23 - step1_20; - step2_27 = step1_24 - step1_27; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" - "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" - "extp %[step3_20], $ac0, 31 \n\t" - - : [step3_20] "=r"(step3_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; - step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_21 = step1_22 - step1_21; - step2_26 = step1_25 - step1_26; - - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" - "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" - "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" - "extp %[step3_21], $ac1, 31 \n\t" + "sub %[temp0], %[step1_17], %[step1_18] \n\t" + "sub %[temp1], %[step1_30], %[step1_29] \n\t" + "add 
%[step3_17], %[step1_17], %[step1_18] \n\t" + "add %[step3_30], %[step1_30], %[step1_29] \n\t" - : [step3_21] "=r"(step3_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), - [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_29], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29), + [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30) + : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17), + [step1_18] "r"(step1_18), [step1_30] "r"(step1_30), + [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; - step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_16], %[step1_19] \n\t" + "sub %[temp1], %[step1_31], %[step1_28] \n\t" + "add %[step3_16], %[step1_16], %[step1_19] \n\t" + "add %[step3_31], %[step1_31], %[step1_28] \n\t" - step3_22 = step1_21 + step1_22; - step3_23 = step1_20 + step1_23; - step3_24 = step1_24 + step1_27; - step3_25 = step1_25 + step1_26; + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_28], $ac1, 31 \n\t" - step2_16 = step3_16 + step3_23; - step2_17 = step3_17 + step3_22; - step2_18 = step3_18 + step3_21; - step2_19 = step3_19 + step3_20; - step2_20 = step3_19 - step3_20; - step2_21 = step3_18 - step3_21; - step2_22 = step3_17 - step3_22; - step2_23 = step3_16 - step3_23; + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31), + [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28) + : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16), + [step1_19] "r"(step1_19), [step1_31] "r"(step1_31), + [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); - step2_24 = step3_31 - step3_24; - step2_25 = step3_30 - step3_25; - step2_26 = step3_29 - step3_26; - step2_27 = step3_28 - step3_27; - step2_28 = step3_28 + step3_27; - step2_29 = step3_29 + step3_26; - step2_30 = step3_30 + step3_25; - step2_31 = step3_31 + step3_24; + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_23], %[step1_20] \n\t" + "sub %[temp1], %[step1_24], %[step1_27] \n\t" + "add %[step3_23], %[step1_23], %[step1_20] \n\t" + "add %[step3_24], %[step1_24], %[step1_27] \n\t" + + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_27], $ac0, 31 \n\t" + "msub $ac1, %[temp0], %[cospi_24_64] \n\t" + "msub $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24), + [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27) + : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23), + [step1_20] "r"(step1_20), 
[step1_24] "r"(step1_24), + [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_22], %[step1_21] \n\t" + "sub %[temp1], %[step1_25], %[step1_26] \n\t" + "add %[step3_22], %[step1_22], %[step1_21] \n\t" + "add %[step3_25], %[step1_25], %[step1_26] \n\t" + + "msub $ac0, %[temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac0, 31 \n\t" + "msub $ac1, %[temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_26], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25), + [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26) + : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22), + [step1_21] "r"(step1_21), [step1_25] "r"(step1_25), + [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "add %[step2_16], %[step3_16], %[step3_23] \n\t" + "add %[step2_17], %[step3_17], %[step3_22] \n\t" + "add %[step2_18], %[step3_18], %[step3_21] \n\t" + "add %[step2_19], %[step3_19], %[step3_20] \n\t" + "sub %[step2_20], %[step3_19], %[step3_20] \n\t" + "sub %[step2_21], %[step3_18], %[step3_21] \n\t" + "sub %[step2_22], %[step3_17], %[step3_22] \n\t" + "sub %[step2_23], %[step3_16], %[step3_23] \n\t" + + : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17), + [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19), + [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21), + [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23) + : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23), + [step3_17] "r"(step3_17), [step3_22] "r"(step3_22), + [step3_18] "r"(step3_18), [step3_21] "r"(step3_21), + [step3_19] "r"(step3_19), [step3_20] "r"(step3_20)); + + __asm__ __volatile__( + "sub %[step2_24], %[step3_31], %[step3_24] \n\t" + "sub %[step2_25], %[step3_30], %[step3_25] \n\t" + "sub %[step2_26], %[step3_29], %[step3_26] \n\t" + "sub %[step2_27], %[step3_28], %[step3_27] \n\t" + "add %[step2_28], %[step3_28], %[step3_27] \n\t" + "add %[step2_29], %[step3_29], %[step3_26] \n\t" + "add %[step2_30], %[step3_30], %[step3_25] \n\t" + "add %[step2_31], %[step3_31], %[step3_24] \n\t" + + : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28), + [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29), + [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30), + [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31) + : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24), + [step3_30] "r"(step3_30), [step3_25] "r"(step3_25), + [step3_29] "r"(step3_29), [step3_26] "r"(step3_26), + [step3_28] "r"(step3_28), [step3_27] "r"(step3_27)); __asm__ __volatile__( "lh %[load1], 0(%[input]) \n\t" @@ -580,9 +617,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [result1] "=&r"(result1), [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), - [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), - [step1_3] "=r"(step1_3) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0), + [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2), + [step1_3] "=&r"(step1_3) : [const_2_power_13] 
"r"(const_2_power_13), [input] "r"(input), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64)); @@ -638,96 +675,137 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), - [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), - [step1_7] "=r"(step1_7) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4), + [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6), + [step1_7] "=&r"(step1_7) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), [cospi_16_64] "r"(cospi_16_64)); - step2_0 = step1_0 + step1_7; - step2_1 = step1_1 + step1_6; - step2_2 = step1_2 + step1_5; - step2_3 = step1_3 + step1_4; - step2_4 = step1_3 - step1_4; - step2_5 = step1_2 - step1_5; - step2_6 = step1_1 - step1_6; - step2_7 = step1_0 - step1_7; + __asm__ __volatile__( + "add %[step2_0], %[step1_0], %[step1_7] \n\t" + "add %[step2_1], %[step1_1], %[step1_6] \n\t" + "add %[step2_2], %[step1_2], %[step1_5] \n\t" + "add %[step2_3], %[step1_3], %[step1_4] \n\t" + "sub %[step2_4], %[step1_3], %[step1_4] \n\t" + "sub %[step2_5], %[step1_2], %[step1_5] \n\t" + "sub %[step2_6], %[step1_1], %[step1_6] \n\t" + "sub %[step2_7], %[step1_0], %[step1_7] \n\t" + + : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4), + [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5), + [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6), + [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7) + : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7), + [step1_1] "r"(step1_1), [step1_6] "r"(step1_6), + [step1_2] "r"(step1_2), [step1_5] "r"(step1_5), + [step1_3] "r"(step1_3), [step1_4] "r"(step1_4)); // stage 7 - step1_0 = step2_0 + step3_15; - step1_1 = step2_1 + step3_14; - step1_2 = step2_2 + step3_13; - step1_3 = step2_3 + step3_12; - step1_4 = step2_4 + step3_11; - step1_5 = step2_5 + step3_10; - step1_6 = step2_6 + step3_9; - step1_7 = step2_7 + step3_8; - step1_8 = step2_7 - step3_8; - step1_9 = step2_6 - step3_9; - step1_10 = step2_5 - step3_10; - step1_11 = step2_4 - step3_11; - step1_12 = step2_3 - step3_12; - step1_13 = step2_2 - step3_13; - step1_14 = step2_1 - step3_14; - step1_15 = step2_0 - step3_15; + __asm__ __volatile__( + "add %[step1_0], %[step2_0], %[step3_15] \n\t" + "add %[step1_1], %[step2_1], %[step3_14] \n\t" + "add %[step1_2], %[step2_2], %[step3_13] \n\t" + "add %[step1_3], %[step2_3], %[step3_12] \n\t" + "sub %[step1_12], %[step2_3], %[step3_12] \n\t" + "sub %[step1_13], %[step2_2], %[step3_13] \n\t" + "sub %[step1_14], %[step2_1], %[step3_14] \n\t" + "sub %[step1_15], %[step2_0], %[step3_15] \n\t" + + : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12), + [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13), + [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14), + [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15) + : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15), + [step2_1] "r"(step2_1), [step3_14] "r"(step3_14), + [step2_2] "r"(step2_2), [step3_13] "r"(step3_13), + [step2_3] "r"(step2_3), [step3_12] "r"(step3_12)); + + __asm__ __volatile__( + "add %[step1_4], %[step2_4], %[step3_11] \n\t" + "add %[step1_5], %[step2_5], %[step3_10] \n\t" + "add %[step1_6], %[step2_6], %[step3_9] \n\t" + "add %[step1_7], %[step2_7], 
%[step3_8] \n\t" + "sub %[step1_8], %[step2_7], %[step3_8] \n\t" + "sub %[step1_9], %[step2_6], %[step3_9] \n\t" + "sub %[step1_10], %[step2_5], %[step3_10] \n\t" + "sub %[step1_11], %[step2_4], %[step3_11] \n\t" + + : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8), + [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9), + [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10), + [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11) + : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11), + [step2_5] "r"(step2_5), [step3_10] "r"(step3_10), + [step2_6] "r"(step2_6), [step3_9] "r"(step3_9), + [step2_7] "r"(step2_7), [step3_8] "r"(step3_8)); __asm__ __volatile__( "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "add %[temp1], %[step2_27], %[step2_20] \n\t" + "sub %[temp2], %[step2_26], %[step2_21] \n\t" + "add %[temp3], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + "extp %[step1_27], $ac1, 31 \n\t" + "extp %[step1_21], $ac2, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" - : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20), + [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26) : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_20 + step2_27) * cospi_16_64; - step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_26], %[step2_21] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_21], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), - [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_21 + step2_26) * cospi_16_64; - step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + [step2_27] "r"(step2_27), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64)); __asm__ __volatile__( "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "add %[temp1], %[step2_25], %[step2_22] \n\t" + "sub %[temp2], %[step2_24], %[step2_23] \n\t" + "add %[temp3], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + "extp %[step1_25], $ac1, 31 \n\t" + "extp %[step1_23], $ac2, 31 \n\t" + "extp %[step1_24], $ac3, 31 \n\t" - : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) - : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), - [step2_22] "r"(step2_22), [cospi_16_64] 
"r"(cospi_16_64)); - - temp21 = (step2_22 + step2_25) * cospi_16_64; - step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_24], %[step2_23] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_23], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) - : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), - [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_23 + step2_24) * cospi_16_64; - step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22), + [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24) + : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22), + [step2_25] "r"(step2_25), [step2_23] "r"(step2_23), + [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64)); __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" @@ -738,14 +816,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_1], %[step2_30] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_2], %[step2_29] \n\t" @@ -755,18 +833,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_3], %[step2_28] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), + : [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), [step2_29] "r"(step2_29), [step2_30] "r"(step2_30), @@ -782,29 +860,29 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_14] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_13] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb 
%[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_12] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" @@ -815,14 +893,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_5], %[step1_26] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_6], %[step1_25] \n\t" @@ -832,18 +910,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_7], %[step1_24] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4), + : [cm] "r"(cm), [stride] "r"(stride), [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), [step1_7] "r"(step1_7), [step1_24] "r"(step1_24), [step1_25] "r"(step1_25), [step1_26] "r"(step1_26), @@ -859,29 +937,29 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_14] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_13] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_12] 
\n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" @@ -892,14 +970,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_9], %[step1_22] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_10], %[step1_21] \n\t" @@ -909,18 +987,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_11], %[step1_20] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8), + : [cm] "r"(cm), [stride] "r"(stride), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9), [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), [step1_21] "r"(step1_21), [step1_22] "r"(step1_22), @@ -936,29 +1014,29 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_14] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_13] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_12] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" : [temp0] 
"=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" @@ -969,14 +1047,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_13], %[step2_18] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_14], %[step2_17] \n\t" @@ -986,7 +1064,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_15], %[step2_16] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" @@ -996,11 +1074,11 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), - [step1_14] "r"(step1_14), [step1_15] "r"(step1_15), - [step2_16] "r"(step2_16), [step2_17] "r"(step2_17), - [step2_18] "r"(step2_18), [step2_19] "r"(step2_19)); + : [cm] "r"(cm), [stride] "r"(stride), [step1_12] "r"(step1_12), + [step1_13] "r"(step1_13), [step1_14] "r"(step1_14), + [step1_15] "r"(step1_15), [step2_16] "r"(step2_16), + [step2_17] "r"(step2_17), [step2_18] "r"(step2_18), + [step2_19] "r"(step2_19)); step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); @@ -1012,18 +1090,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_14] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_13] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_12] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" @@ -1031,9 +1109,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] 
"=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); input += 32; } diff --git a/libs/libvpx/vpx_dsp/mips/itrans32_dspr2.c b/libs/libvpx/vpx_dsp/mips/itrans32_dspr2.c index d71c5ffed5..3c0468c00f 100644 --- a/libs/libvpx/vpx_dsp/mips/itrans32_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/itrans32_dspr2.c @@ -18,24 +18,23 @@ #if HAVE_DSPR2 static void idct32_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { - int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; - int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; - int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; - int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; - int16_t step1_28, step1_29, step1_30, step1_31; - int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; - int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; - int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; - int16_t step2_28, step2_29, step2_30, step2_31; - int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; - int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; - int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; - int16_t step3_29, step3_30, step3_31; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; + int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; + int step1_28, step1_29, step1_30, step1_31; + int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int step2_28, step2_29, step2_30, step2_31; + int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; + int step3_29, step3_30, step3_31; int temp0, temp1, temp2, temp3; int load1, load2, load3, load4; int result1, result2; - int temp21; int i; const int const_2_power_13 = 8192; const int32_t *input_int; @@ -147,9 +146,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), - [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), - [step1_31] "=r"(step1_31) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17), + [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), [cospi_4_64] "r"(cospi_4_64), 
[cospi_17_64] "r"(cospi_17_64), @@ -207,9 +206,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), - [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), - [step1_29] "=r"(step1_29) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19), + [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), @@ -267,9 +266,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), - [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), - [step1_27] "=r"(step1_27) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), @@ -289,7 +288,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac1, %[load1], %[cospi_19_64] \n\t" "msub $ac1, %[load2], %[cospi_13_64] \n\t" "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_13_64] \n\t" "madd $ac3, %[load2], %[cospi_19_64] \n\t" "extp %[temp3], $ac3, 31 \n\t" @@ -302,7 +300,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac2, %[load3], %[cospi_3_64] \n\t" "msub $ac2, %[load4], %[cospi_29_64] \n\t" "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_29_64] \n\t" "madd $ac1, %[load4], %[cospi_3_64] \n\t" "extp %[temp2], $ac1, 31 \n\t" @@ -314,12 +311,10 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[load1], %[temp1], %[temp0] \n\t" "sub %[load2], %[temp2], %[temp3] \n\t" - "msub $ac1, %[load1], %[cospi_12_64] \n\t" "msub $ac1, %[load2], %[cospi_20_64] \n\t" "msub $ac3, %[load1], %[cospi_20_64] \n\t" "madd $ac3, %[load2], %[cospi_12_64] \n\t" - "extp %[step1_22], $ac1, 31 \n\t" "extp %[step1_25], $ac3, 31 \n\t" "add %[step1_23], %[temp0], %[temp1] \n\t" @@ -327,9 +322,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), - [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), - [step1_25] "=r"(step1_25) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), @@ -349,7 +344,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac1, %[load1], %[cospi_30_64] \n\t" "msub $ac1, %[load2], %[cospi_2_64] \n\t" "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, 
%[load1], %[cospi_2_64] \n\t" "madd $ac3, %[load2], %[cospi_30_64] \n\t" "extp %[temp3], $ac3, 31 \n\t" @@ -362,7 +356,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac2, %[load3], %[cospi_14_64] \n\t" "msub $ac2, %[load4], %[cospi_18_64] \n\t" "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_18_64] \n\t" "madd $ac1, %[load4], %[cospi_14_64] \n\t" "extp %[temp2], $ac1, 31 \n\t" @@ -374,12 +367,10 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[load1], %[temp0], %[temp1] \n\t" "sub %[load2], %[temp3], %[temp2] \n\t" - "msub $ac1, %[load1], %[cospi_8_64] \n\t" "madd $ac1, %[load2], %[cospi_24_64] \n\t" "madd $ac3, %[load1], %[cospi_24_64] \n\t" "madd $ac3, %[load2], %[cospi_8_64] \n\t" - "extp %[step2_9], $ac1, 31 \n\t" "extp %[step2_14], $ac3, 31 \n\t" "add %[step2_8], %[temp0], %[temp1] \n\t" @@ -387,9 +378,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), - [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), - [step2_15] "=r"(step2_15) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8), + [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14), + [step2_15] "=&r"(step2_15) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), @@ -409,7 +400,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac1, %[load1], %[cospi_22_64] \n\t" "msub $ac1, %[load2], %[cospi_10_64] \n\t" "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_10_64] \n\t" "madd $ac3, %[load2], %[cospi_22_64] \n\t" "extp %[temp3], $ac3, 31 \n\t" @@ -422,7 +412,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac2, %[load3], %[cospi_6_64] \n\t" "msub $ac2, %[load4], %[cospi_26_64] \n\t" "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_26_64] \n\t" "madd $ac1, %[load4], %[cospi_6_64] \n\t" "extp %[temp2], $ac1, 31 \n\t" @@ -434,12 +423,10 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[load1], %[temp1], %[temp0] \n\t" "sub %[load2], %[temp2], %[temp3] \n\t" - "msub $ac1, %[load1], %[cospi_24_64] \n\t" "msub $ac1, %[load2], %[cospi_8_64] \n\t" "madd $ac3, %[load2], %[cospi_24_64] \n\t" "msub $ac3, %[load1], %[cospi_8_64] \n\t" - "extp %[step2_10], $ac1, 31 \n\t" "extp %[step2_13], $ac3, 31 \n\t" "add %[step2_11], %[temp0], %[temp1] \n\t" @@ -447,9 +434,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), - [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), - [step2_13] "=r"(step2_13) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11), + [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), @@ -462,21 +449,18 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[temp0], 
%[temp0], %[step2_9] \n\t" "add %[temp0], %[temp0], %[step2_10] \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" "sub %[temp1], %[step2_14], %[step2_13] \n\t" "add %[temp1], %[temp1], %[step2_9] \n\t" "sub %[temp1], %[temp1], %[step2_10] \n\t" "madd $ac1, %[temp1], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" "mthi $zero, $ac2 \n\t" "sub %[temp0], %[step2_15], %[step2_12] \n\t" "sub %[temp0], %[temp0], %[step2_8] \n\t" "add %[temp0], %[temp0], %[step2_11] \n\t" "madd $ac2, %[temp0], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" "mthi $zero, $ac3 \n\t" "sub %[temp1], %[step2_15], %[step2_12] \n\t" @@ -488,122 +472,159 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "add %[step3_9], %[step2_9], %[step2_10] \n\t" "add %[step3_14], %[step2_13], %[step2_14] \n\t" "add %[step3_15], %[step2_12], %[step2_15] \n\t" - "extp %[step3_10], $ac0, 31 \n\t" "extp %[step3_13], $ac1, 31 \n\t" "extp %[step3_11], $ac2, 31 \n\t" "extp %[step3_12], $ac3, 31 \n\t" - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), - [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), - [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), - [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), - [step3_15] "=r"(step3_15) + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8), + [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10), + [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12), + [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14), + [step3_15] "=&r"(step3_15) : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); - step2_18 = step1_17 - step1_18; - step2_29 = step1_30 - step1_29; - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" - "extp %[step3_18], $ac0, 31 \n\t" - - : [step3_18] "=r"(step3_18) - : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), - [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; - step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_19 = step1_16 - step1_19; - step2_28 = step1_31 - step1_28; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" - "extp %[step3_19], $ac0, 31 \n\t" - - : [step3_19] "=r"(step3_19) - : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), - [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; - step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_16 = step1_16 + step1_19; - step3_17 = step1_17 + step1_18; - step3_30 = step1_29 + step1_30; - step3_31 = step1_28 + step1_31; - - step2_20 = step1_23 - step1_20; - step2_27 = step1_24 - step1_27; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" - "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" - "extp %[step3_20], $ac0, 31 \n\t" - - : 
[step3_20] "=r"(step3_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; - step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_21 = step1_22 - step1_21; - step2_26 = step1_25 - step1_26; - - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" - "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" - "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" - "extp %[step3_21], $ac1, 31 \n\t" + "sub %[temp0], %[step1_17], %[step1_18] \n\t" + "sub %[temp1], %[step1_30], %[step1_29] \n\t" + "add %[step3_17], %[step1_17], %[step1_18] \n\t" + "add %[step3_30], %[step1_30], %[step1_29] \n\t" - : [step3_21] "=r"(step3_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), - [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_29], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29), + [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30) + : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17), + [step1_18] "r"(step1_18), [step1_30] "r"(step1_30), + [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; - step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_16], %[step1_19] \n\t" + "sub %[temp1], %[step1_31], %[step1_28] \n\t" + "add %[step3_16], %[step1_16], %[step1_19] \n\t" + "add %[step3_31], %[step1_31], %[step1_28] \n\t" - step3_22 = step1_21 + step1_22; - step3_23 = step1_20 + step1_23; - step3_24 = step1_24 + step1_27; - step3_25 = step1_25 + step1_26; + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_28], $ac1, 31 \n\t" - step2_16 = step3_16 + step3_23; - step2_17 = step3_17 + step3_22; - step2_18 = step3_18 + step3_21; - step2_19 = step3_19 + step3_20; - step2_20 = step3_19 - step3_20; - step2_21 = step3_18 - step3_21; - step2_22 = step3_17 - step3_22; - step2_23 = step3_16 - step3_23; + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31), + [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28) + : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16), + [step1_19] "r"(step1_19), [step1_31] "r"(step1_31), + [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); - step2_24 = step3_31 - step3_24; - step2_25 = step3_30 - step3_25; - step2_26 = step3_29 - step3_26; - step2_27 = step3_28 - step3_27; - step2_28 = step3_28 + step3_27; - step2_29 = step3_29 + step3_26; - step2_30 = step3_30 + step3_25; - step2_31 = step3_31 + step3_24; + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub 
%[temp0], %[step1_23], %[step1_20] \n\t" + "sub %[temp1], %[step1_24], %[step1_27] \n\t" + "add %[step3_23], %[step1_23], %[step1_20] \n\t" + "add %[step3_24], %[step1_24], %[step1_27] \n\t" + + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_27], $ac0, 31 \n\t" + "msub $ac1, %[temp0], %[cospi_24_64] \n\t" + "msub $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24), + [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27) + : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23), + [step1_20] "r"(step1_20), [step1_24] "r"(step1_24), + [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_22], %[step1_21] \n\t" + "sub %[temp1], %[step1_25], %[step1_26] \n\t" + "add %[step3_22], %[step1_22], %[step1_21] \n\t" + "add %[step3_25], %[step1_25], %[step1_26] \n\t" + + "msub $ac0, %[temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac0, 31 \n\t" + "msub $ac1, %[temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_26], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25), + [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26) + : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22), + [step1_21] "r"(step1_21), [step1_25] "r"(step1_25), + [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "add %[step2_16], %[step3_16], %[step3_23] \n\t" + "add %[step2_17], %[step3_17], %[step3_22] \n\t" + "add %[step2_18], %[step3_18], %[step3_21] \n\t" + "add %[step2_19], %[step3_19], %[step3_20] \n\t" + "sub %[step2_20], %[step3_19], %[step3_20] \n\t" + "sub %[step2_21], %[step3_18], %[step3_21] \n\t" + "sub %[step2_22], %[step3_17], %[step3_22] \n\t" + "sub %[step2_23], %[step3_16], %[step3_23] \n\t" + + : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17), + [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19), + [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21), + [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23) + : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23), + [step3_17] "r"(step3_17), [step3_22] "r"(step3_22), + [step3_18] "r"(step3_18), [step3_21] "r"(step3_21), + [step3_19] "r"(step3_19), [step3_20] "r"(step3_20)); + + __asm__ __volatile__( + "sub %[step2_24], %[step3_31], %[step3_24] \n\t" + "sub %[step2_25], %[step3_30], %[step3_25] \n\t" + "sub %[step2_26], %[step3_29], %[step3_26] \n\t" + "sub %[step2_27], %[step3_28], %[step3_27] \n\t" + "add %[step2_28], %[step3_28], %[step3_27] \n\t" + "add %[step2_29], %[step3_29], %[step3_26] \n\t" + "add %[step2_30], %[step3_30], %[step3_25] \n\t" + "add %[step2_31], %[step3_31], %[step3_24] \n\t" + + : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28), + [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29), + [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30), + [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31) + : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24), + [step3_30] "r"(step3_30), [step3_25] "r"(step3_25), + [step3_29] "r"(step3_29), [step3_26] "r"(step3_26), + [step3_28] 
"r"(step3_28), [step3_27] "r"(step3_27)); __asm__ __volatile__( "lh %[load1], 0(%[input]) \n\t" @@ -627,29 +648,25 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac3, %[load3], %[cospi_24_64] \n\t" "msub $ac3, %[load4], %[cospi_8_64] \n\t" "extp %[temp2], $ac3, 31 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" "madd $ac1, %[load3], %[cospi_8_64] \n\t" "madd $ac1, %[load4], %[cospi_24_64] \n\t" "extp %[temp3], $ac1, 31 \n\t" - - "add %[step1_0], %[temp0], %[temp3] \n\t" - "add %[step1_1], %[temp1], %[temp2] \n\t" - "sub %[step1_2], %[temp1], %[temp2] \n\t" - "sub %[step1_3], %[temp0], %[temp3] \n\t" + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [result1] "=&r"(result1), [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), - [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), - [step1_3] "=r"(step1_3) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0), + [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2), + [step1_3] "=&r"(step1_3) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64) - - ); + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); __asm__ __volatile__( "lh %[load1], 8(%[input]) \n\t" @@ -665,7 +682,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac1, %[load1], %[cospi_28_64] \n\t" "msub $ac1, %[load2], %[cospi_4_64] \n\t" "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_4_64] \n\t" "madd $ac3, %[load2], %[cospi_28_64] \n\t" "extp %[temp3], $ac3, 31 \n\t" @@ -678,7 +694,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac2, %[load3], %[cospi_12_64] \n\t" "msub $ac2, %[load4], %[cospi_20_64] \n\t" "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_20_64] \n\t" "madd $ac1, %[load4], %[cospi_12_64] \n\t" "extp %[temp2], $ac1, 31 \n\t" @@ -691,11 +706,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[load1], %[temp3], %[temp2] \n\t" "sub %[load1], %[load1], %[temp0] \n\t" "add %[load1], %[load1], %[temp1] \n\t" - "sub %[load2], %[temp0], %[temp1] \n\t" "sub %[load2], %[load2], %[temp2] \n\t" "add %[load2], %[load2], %[temp3] \n\t" - "madd $ac1, %[load1], %[cospi_16_64] \n\t" "madd $ac3, %[load2], %[cospi_16_64] \n\t" @@ -706,129 +719,246 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), - [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), - [step1_7] "=r"(step1_7) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4), + [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6), + [step1_7] "=&r"(step1_7) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), [cospi_16_64] "r"(cospi_16_64)); - step2_0 = step1_0 + step1_7; - step2_1 = step1_1 + step1_6; - step2_2 = step1_2 + 
step1_5; - step2_3 = step1_3 + step1_4; - step2_4 = step1_3 - step1_4; - step2_5 = step1_2 - step1_5; - step2_6 = step1_1 - step1_6; - step2_7 = step1_0 - step1_7; + __asm__ __volatile__( + "add %[step2_0], %[step1_0], %[step1_7] \n\t" + "add %[step2_1], %[step1_1], %[step1_6] \n\t" + "add %[step2_2], %[step1_2], %[step1_5] \n\t" + "add %[step2_3], %[step1_3], %[step1_4] \n\t" + "sub %[step2_4], %[step1_3], %[step1_4] \n\t" + "sub %[step2_5], %[step1_2], %[step1_5] \n\t" + "sub %[step2_6], %[step1_1], %[step1_6] \n\t" + "sub %[step2_7], %[step1_0], %[step1_7] \n\t" - step1_0 = step2_0 + step3_15; - step1_1 = step2_1 + step3_14; - step1_2 = step2_2 + step3_13; - step1_3 = step2_3 + step3_12; - step1_4 = step2_4 + step3_11; - step1_5 = step2_5 + step3_10; - step1_6 = step2_6 + step3_9; - step1_7 = step2_7 + step3_8; - step1_8 = step2_7 - step3_8; - step1_9 = step2_6 - step3_9; - step1_10 = step2_5 - step3_10; - step1_11 = step2_4 - step3_11; - step1_12 = step2_3 - step3_12; - step1_13 = step2_2 - step3_13; - step1_14 = step2_1 - step3_14; - step1_15 = step2_0 - step3_15; + : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4), + [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5), + [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6), + [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7) + : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7), + [step1_1] "r"(step1_1), [step1_6] "r"(step1_6), + [step1_2] "r"(step1_2), [step1_5] "r"(step1_5), + [step1_3] "r"(step1_3), [step1_4] "r"(step1_4)); + + // stage 7 + __asm__ __volatile__( + "add %[step1_0], %[step2_0], %[step3_15] \n\t" + "add %[step1_1], %[step2_1], %[step3_14] \n\t" + "add %[step1_2], %[step2_2], %[step3_13] \n\t" + "add %[step1_3], %[step2_3], %[step3_12] \n\t" + "sub %[step1_12], %[step2_3], %[step3_12] \n\t" + "sub %[step1_13], %[step2_2], %[step3_13] \n\t" + "sub %[step1_14], %[step2_1], %[step3_14] \n\t" + "sub %[step1_15], %[step2_0], %[step3_15] \n\t" + + : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12), + [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13), + [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14), + [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15) + : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15), + [step2_1] "r"(step2_1), [step3_14] "r"(step3_14), + [step2_2] "r"(step2_2), [step3_13] "r"(step3_13), + [step2_3] "r"(step2_3), [step3_12] "r"(step3_12)); + + __asm__ __volatile__( + "add %[step1_4], %[step2_4], %[step3_11] \n\t" + "add %[step1_5], %[step2_5], %[step3_10] \n\t" + "add %[step1_6], %[step2_6], %[step3_9] \n\t" + "add %[step1_7], %[step2_7], %[step3_8] \n\t" + "sub %[step1_8], %[step2_7], %[step3_8] \n\t" + "sub %[step1_9], %[step2_6], %[step3_9] \n\t" + "sub %[step1_10], %[step2_5], %[step3_10] \n\t" + "sub %[step1_11], %[step2_4], %[step3_11] \n\t" + + : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8), + [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9), + [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10), + [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11) + : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11), + [step2_5] "r"(step2_5), [step3_10] "r"(step3_10), + [step2_6] "r"(step2_6), [step3_9] "r"(step3_9), + [step2_7] "r"(step2_7), [step3_8] "r"(step3_8)); __asm__ __volatile__( "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "add %[temp1], %[step2_27], %[step2_20] \n\t" + "sub %[temp2], %[step2_26], %[step2_21] \n\t" + "add %[temp3], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi 
$zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + "extp %[step1_27], $ac1, 31 \n\t" + "extp %[step1_21], $ac2, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" - : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20), + [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26) : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_20 + step2_27) * cospi_16_64; - step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_26], %[step2_21] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_21], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), - [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_21 + step2_26) * cospi_16_64; - step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + [step2_27] "r"(step2_27), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64)); __asm__ __volatile__( "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "add %[temp1], %[step2_25], %[step2_22] \n\t" + "sub %[temp2], %[step2_24], %[step2_23] \n\t" + "add %[temp3], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + "extp %[step1_25], $ac1, 31 \n\t" + "extp %[step1_23], $ac2, 31 \n\t" + "extp %[step1_24], $ac3, 31 \n\t" - : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) - : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), - [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_22 + step2_25) * cospi_16_64; - step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_24], %[step2_23] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_23], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) - : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), - [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_23 + step2_24) * cospi_16_64; - step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22), + [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24) + : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22), + [step2_25] "r"(step2_25), [step2_23] 
"r"(step2_23), + [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64)); // final stage - output[0 * 32] = step1_0 + step2_31; - output[1 * 32] = step1_1 + step2_30; - output[2 * 32] = step1_2 + step2_29; - output[3 * 32] = step1_3 + step2_28; - output[4 * 32] = step1_4 + step1_27; - output[5 * 32] = step1_5 + step1_26; - output[6 * 32] = step1_6 + step1_25; - output[7 * 32] = step1_7 + step1_24; - output[8 * 32] = step1_8 + step1_23; - output[9 * 32] = step1_9 + step1_22; - output[10 * 32] = step1_10 + step1_21; - output[11 * 32] = step1_11 + step1_20; - output[12 * 32] = step1_12 + step2_19; - output[13 * 32] = step1_13 + step2_18; - output[14 * 32] = step1_14 + step2_17; - output[15 * 32] = step1_15 + step2_16; - output[16 * 32] = step1_15 - step2_16; - output[17 * 32] = step1_14 - step2_17; - output[18 * 32] = step1_13 - step2_18; - output[19 * 32] = step1_12 - step2_19; - output[20 * 32] = step1_11 - step1_20; - output[21 * 32] = step1_10 - step1_21; - output[22 * 32] = step1_9 - step1_22; - output[23 * 32] = step1_8 - step1_23; - output[24 * 32] = step1_7 - step1_24; - output[25 * 32] = step1_6 - step1_25; - output[26 * 32] = step1_5 - step1_26; - output[27 * 32] = step1_4 - step1_27; - output[28 * 32] = step1_3 - step2_28; - output[29 * 32] = step1_2 - step2_29; - output[30 * 32] = step1_1 - step2_30; - output[31 * 32] = step1_0 - step2_31; + __asm__ __volatile__( + "add %[temp0], %[step1_0], %[step2_31] \n\t" + "add %[temp1], %[step1_1], %[step2_30] \n\t" + "add %[temp2], %[step1_2], %[step2_29] \n\t" + "add %[temp3], %[step1_3], %[step2_28] \n\t" + "sub %[load1], %[step1_3], %[step2_28] \n\t" + "sub %[load2], %[step1_2], %[step2_29] \n\t" + "sub %[load3], %[step1_1], %[step2_30] \n\t" + "sub %[load4], %[step1_0], %[step2_31] \n\t" + "sh %[temp0], 0(%[output]) \n\t" + "sh %[temp1], 64(%[output]) \n\t" + "sh %[temp2], 128(%[output]) \n\t" + "sh %[temp3], 192(%[output]) \n\t" + "sh %[load1], 1792(%[output]) \n\t" + "sh %[load2], 1856(%[output]) \n\t" + "sh %[load3], 1920(%[output]) \n\t" + "sh %[load4], 1984(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_0] "r"(step1_0), [step2_31] "r"(step2_31), + [step1_1] "r"(step1_1), [step2_30] "r"(step2_30), + [step1_2] "r"(step1_2), [step2_29] "r"(step2_29), + [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), + [output] "r"(output)); + + __asm__ __volatile__( + "add %[temp0], %[step1_4], %[step1_27] \n\t" + "add %[temp1], %[step1_5], %[step1_26] \n\t" + "add %[temp2], %[step1_6], %[step1_25] \n\t" + "add %[temp3], %[step1_7], %[step1_24] \n\t" + "sub %[load1], %[step1_7], %[step1_24] \n\t" + "sub %[load2], %[step1_6], %[step1_25] \n\t" + "sub %[load3], %[step1_5], %[step1_26] \n\t" + "sub %[load4], %[step1_4], %[step1_27] \n\t" + "sh %[temp0], 256(%[output]) \n\t" + "sh %[temp1], 320(%[output]) \n\t" + "sh %[temp2], 384(%[output]) \n\t" + "sh %[temp3], 448(%[output]) \n\t" + "sh %[load1], 1536(%[output]) \n\t" + "sh %[load2], 1600(%[output]) \n\t" + "sh %[load3], 1664(%[output]) \n\t" + "sh %[load4], 1728(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_4] "r"(step1_4), [step1_27] "r"(step1_27), + [step1_5] "r"(step1_5), [step1_26] "r"(step1_26), + [step1_6] "r"(step1_6), [step1_25] "r"(step1_25), + [step1_7] 
"r"(step1_7), [step1_24] "r"(step1_24), + [output] "r"(output)); + + __asm__ __volatile__( + "add %[temp0], %[step1_8], %[step1_23] \n\t" + "add %[temp1], %[step1_9], %[step1_22] \n\t" + "add %[temp2], %[step1_10], %[step1_21] \n\t" + "add %[temp3], %[step1_11], %[step1_20] \n\t" + "sub %[load1], %[step1_11], %[step1_20] \n\t" + "sub %[load2], %[step1_10], %[step1_21] \n\t" + "sub %[load3], %[step1_9], %[step1_22] \n\t" + "sub %[load4], %[step1_8], %[step1_23] \n\t" + "sh %[temp0], 512(%[output]) \n\t" + "sh %[temp1], 576(%[output]) \n\t" + "sh %[temp2], 640(%[output]) \n\t" + "sh %[temp3], 704(%[output]) \n\t" + "sh %[load1], 1280(%[output]) \n\t" + "sh %[load2], 1344(%[output]) \n\t" + "sh %[load3], 1408(%[output]) \n\t" + "sh %[load4], 1472(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_8] "r"(step1_8), [step1_23] "r"(step1_23), + [step1_9] "r"(step1_9), [step1_22] "r"(step1_22), + [step1_10] "r"(step1_10), [step1_21] "r"(step1_21), + [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), + [output] "r"(output)); + + __asm__ __volatile__( + "add %[temp0], %[step1_12], %[step2_19] \n\t" + "add %[temp1], %[step1_13], %[step2_18] \n\t" + "add %[temp2], %[step1_14], %[step2_17] \n\t" + "add %[temp3], %[step1_15], %[step2_16] \n\t" + "sub %[load1], %[step1_15], %[step2_16] \n\t" + "sub %[load2], %[step1_14], %[step2_17] \n\t" + "sub %[load3], %[step1_13], %[step2_18] \n\t" + "sub %[load4], %[step1_12], %[step2_19] \n\t" + "sh %[temp0], 768(%[output]) \n\t" + "sh %[temp1], 832(%[output]) \n\t" + "sh %[temp2], 896(%[output]) \n\t" + "sh %[temp3], 960(%[output]) \n\t" + "sh %[load1], 1024(%[output]) \n\t" + "sh %[load2], 1088(%[output]) \n\t" + "sh %[load3], 1152(%[output]) \n\t" + "sh %[load4], 1216(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_12] "r"(step1_12), [step2_19] "r"(step2_19), + [step1_13] "r"(step1_13), [step2_18] "r"(step2_18), + [step1_14] "r"(step1_14), [step2_17] "r"(step2_17), + [step1_15] "r"(step1_15), [step2_16] "r"(step2_16), + [output] "r"(output)); input += 32; output += 1; @@ -836,7 +966,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, } void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { + int stride) { DECLARE_ALIGNED(32, int16_t, out[32 * 32]); int16_t *outptr = out; uint32_t pos = 45; @@ -850,7 +980,7 @@ void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, idct32_rows_dspr2(input, outptr, 32); // Columns - vpx_idct32_cols_add_blk_dspr2(out, dest, dest_stride); + vpx_idct32_cols_add_blk_dspr2(out, dest, stride); } void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, @@ -941,7 +1071,7 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, "abs %[absa1], %[a1] \n\t" "replv.qb %[vector_a1], %[absa1] \n\t" - : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [absa1] "=&r"(absa1), [vector_a1] "=&r"(vector_a1) : [a1] "r"(a1)); for (r = 32; r--;) { @@ -980,12 +1110,71 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, [dest] "+&r"(dest) : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are 
four byte aligned */ + a11 = a1 >> 1; + a12 = a1 - a11; + __asm__ __volatile__( + "replv.qb %[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 32; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t" + "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t" + "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); + } } else { /* use quad-byte * input and output memory are four byte aligned */ __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" - : [vector_a1] "=r"(vector_a1) + : [vector_a1] "=&r"(vector_a1) : [a1] "r"(a1)); for (r = 32; r--;) { diff --git a/libs/libvpx/vpx_dsp/mips/itrans4_dspr2.c b/libs/libvpx/vpx_dsp/mips/itrans4_dspr2.c index 516ea80f4a..e214b538d4 100644 --- a/libs/libvpx/vpx_dsp/mips/itrans4_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/itrans4_dspr2.c @@ -15,7 +15,7 @@ #if HAVE_DSPR2 void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) { - int16_t step_0, step_1, step_2, step_3; + int step_0, step_1, step_2, step_3; int Temp0, Temp1, Temp2, Temp3; const int const_2_power_13 = 8192; int i; @@ -96,23 +96,13 @@ void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) { } void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { - int16_t step_0, step_1, step_2, step_3; + int stride) { + int step_0, step_1, step_2, step_3; int Temp0, Temp1, Temp2, Temp3; const int const_2_power_13 = 8192; + const int const_255 = 255; int i; uint8_t *dest_pix; - uint8_t *cm = vpx_ff_cropTbl; - - /* prefetch vpx_ff_cropTbl */ - prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); - prefetch_load(vpx_ff_cropTbl + 128); - prefetch_load(vpx_ff_cropTbl + 160); - prefetch_load(vpx_ff_cropTbl + 192); - 
prefetch_load(vpx_ff_cropTbl + 224); for (i = 0; i < 4; ++i) { dest_pix = (dest + i); @@ -172,51 +162,62 @@ void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, "sra %[Temp0], %[Temp0], 4 \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "add %[Temp0], %[step_1], %[step_2] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "addi %[Temp0], %[Temp0], 8 \n\t" "sra %[Temp0], %[Temp0], 4 \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "sub %[Temp0], %[step_1], %[step_2] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "addi %[Temp0], %[Temp0], 8 \n\t" "sra %[Temp0], %[Temp0], 4 \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "sub %[Temp0], %[step_0], %[step_3] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "addi %[Temp0], %[Temp0], 8 \n\t" "sra %[Temp0], %[Temp0], 4 \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [dest_pix] "+r"(dest_pix) - : [const_2_power_13] "r"(const_2_power_13), + : [const_2_power_13] "r"(const_2_power_13), [const_255] "r"(const_255), [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), - [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), - [dest_stride] "r"(dest_stride)); + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), + [stride] "r"(stride)); input += 4; } } -void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { DECLARE_ALIGNED(32, int16_t, out[4 * 4]); int16_t *outptr = out; uint32_t pos = 45; @@ -230,11 +231,10 @@ void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, vpx_idct4_rows_dspr2(input, outptr); // Columns - vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); + vpx_idct4_columns_add_blk_dspr2(&out[0], dest, stride); } -void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct4x4_1_add_dspr2(const int16_t 
*input, uint8_t *dest, int stride) { int a1, absa1; int r; int32_t out; @@ -271,10 +271,43 @@ void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, "lw %[t2], 0(%[dest]) \n\t" "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" "sw %[vector_a], 0(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are four byte aligned */ + a11 = a1 >> 3; + a12 = a1 - (a11 * 7); + + __asm__ __volatile__( + "replv.qb %[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 4; r--;) { + __asm__ __volatile__( + "lw %[t2], 4(%[dest]) \n\t" + "addu_s.qb %[vector_a], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a12] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); } } else { /* use quad-byte @@ -288,10 +321,10 @@ void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, "lw %[t2], 0(%[dest]) \n\t" "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" "sw %[vector_a], 0(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); } } } @@ -310,6 +343,7 @@ void iadst4_dspr2(const int16_t *input, int16_t *output) { return; } + // 32-bit result is enough for the following multiplications. 
s0 = sinpi_1_9 * x0; s1 = sinpi_2_9 * x0; s2 = sinpi_3_9 * x1; diff --git a/libs/libvpx/vpx_dsp/mips/itrans8_dspr2.c b/libs/libvpx/vpx_dsp/mips/itrans8_dspr2.c index 08a6c78b6e..d4d246965c 100644 --- a/libs/libvpx/vpx_dsp/mips/itrans8_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/itrans8_dspr2.c @@ -192,24 +192,13 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { } } -void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) { int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int Temp0, Temp1, Temp2, Temp3; int i; const int const_2_power_13 = 8192; + const int const_255 = 255; uint8_t *dest_pix; - uint8_t *cm = vpx_ff_cropTbl; - - /* prefetch vpx_ff_cropTbl */ - prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); - prefetch_load(vpx_ff_cropTbl + 128); - prefetch_load(vpx_ff_cropTbl + 160); - prefetch_load(vpx_ff_cropTbl + 192); - prefetch_load(vpx_ff_cropTbl + 224); for (i = 0; i < 8; ++i) { dest_pix = (dest + i); @@ -356,70 +345,94 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "add %[Temp0], %[step1_1], %[step1_6] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "add %[Temp0], %[step1_2], %[step1_5] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "add %[Temp0], %[step1_3], %[step1_4] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "sub %[Temp0], %[step1_3], %[step1_4] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi 
%[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "sub %[Temp0], %[step1_2], %[step1_5] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "sub %[Temp0], %[step1_1], %[step1_6] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "sub %[Temp0], %[step1_0], %[step1_7] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), @@ -427,19 +440,18 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix) - : [const_2_power_13] "r"(const_2_power_13), + : [const_2_power_13] "r"(const_2_power_13), [const_255] "r"(const_255), [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), - [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), - [dest_stride] "r"(dest_stride)); + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), + [stride] "r"(stride)); input += 8; } } -void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { DECLARE_ALIGNED(32, int16_t, out[8 * 8]); int16_t *outptr = out; uint32_t pos = 45; @@ -451,11 +463,10 @@ void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, idct8_rows_dspr2(input, outptr, 8); // Then transform columns and add to dest - idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, stride); } -void 
vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { DECLARE_ALIGNED(32, int16_t, out[8 * 8]); int16_t *outptr = out; uint32_t pos = 45; @@ -490,11 +501,10 @@ void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, : [outptr] "r"(outptr)); // Then transform columns and add to dest - idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, stride); } -void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { uint32_t pos = 45; int32_t out; int32_t r; @@ -533,11 +543,47 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" "sw %[vector_1], 0(%[dest]) \n\t" "sw %[vector_2], 4(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are four byte aligned */ + a11 = a1 >> 2; + a12 = a1 - (a11 * 3); + + __asm__ __volatile__( + "replv.qb %[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 8; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), + [vector_2] "=&r"(vector_2), [dest] "+r"(dest) + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); } } else { /* use quad-byte @@ -555,11 +601,11 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" "sw %[vector_1], 0(%[dest]) \n\t" "sw %[vector_2], 4(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), [dest] "+r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); } } } diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_16_msa.c b/libs/libvpx/vpx_dsp/mips/loopfilter_16_msa.c index 2c8a10fc5d..b1731f2345 100644 --- a/libs/libvpx/vpx_dsp/mips/loopfilter_16_msa.c +++ b/libs/libvpx/vpx_dsp/mips/loopfilter_16_msa.c @@ -8,13 +8,15 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "vpx_ports/mem.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/loopfilter_msa.h" +#include "vpx_ports/mem.h" -int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +static int32_t hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, + uint8_t *filter48, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; v16u8 flat, mask, hev, thresh, b_limit, limit; @@ -77,7 +79,7 @@ int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, } } -void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { +static void hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { v16u8 flat, flat2, filter8; v16i8 zero = { 0 }; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; @@ -403,20 +405,21 @@ void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { } } -void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, int32_t count) { +static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, + int32_t count) { DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]); uint8_t early_exit = 0; (void)count; - early_exit = vpx_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, - limit_ptr, thresh_ptr); + early_exit = hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, + limit_ptr, thresh_ptr); if (0 == early_exit) { - vpx_hz_lpf_t16_16w(src, pitch, filter48); + hz_lpf_t16_16w(src, pitch, filter48); } } @@ -448,7 +451,7 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); - VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); @@ -638,19 +641,19 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, } } } else { - vpx_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr, - thresh_ptr, count); + mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, + count); } } -void vpx_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1); } -void vpx_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch, +void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, const uint8_t *thresh_ptr) { @@ -752,11 +755,11 @@ static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output, ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); } -int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, - uint8_t *src_org, int32_t pitch_org, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +static int32_t vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t 
*filter48, + uint8_t *src_org, int32_t pitch_org, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; v16u8 flat, mask, hev, thresh, b_limit, limit; @@ -778,7 +781,7 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, /* flat4 */ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); /* filter4 */ - VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); @@ -819,8 +822,8 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, } } -int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, - uint8_t *filter48) { +static int32_t vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { v16i8 zero = { 0 }; v16u8 filter8, flat, flat2; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; @@ -1050,12 +1053,12 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); early_exit = - vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, - pitch, b_limit_ptr, limit_ptr, thresh_ptr); + vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, pitch, + b_limit_ptr, limit_ptr, thresh_ptr); if (0 == early_exit) { - early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, - &filter48[0]); + early_exit = + vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, &filter48[0]); if (0 == early_exit) { transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch); @@ -1063,11 +1066,11 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, } } -int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, - uint8_t *src_org, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +static int32_t vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; v16u8 flat, mask, hev, thresh, b_limit, limit; @@ -1140,8 +1143,8 @@ int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, } } -int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, - uint8_t *filter48) { +static int32_t vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { v16u8 flat, flat2, filter8; v16i8 zero = { 0 }; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; @@ -1472,12 +1475,12 @@ void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch, transpose_16x16((src - 8), pitch, &transposed_input[0], 16); early_exit = - vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, - pitch, b_limit_ptr, limit_ptr, thresh_ptr); + vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); if (0 == early_exit) { - early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, - &filter48[0]); + early_exit = + vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]); if (0 == early_exit) { transpose_16x16(transposed_input, 16, (src - 8), pitch); diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_4_msa.c b/libs/libvpx/vpx_dsp/mips/loopfilter_4_msa.c index 
e0079665f7..0eff2b6ca9 100644 --- a/libs/libvpx/vpx_dsp/mips/loopfilter_4_msa.c +++ b/libs/libvpx/vpx_dsp/mips/loopfilter_4_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/loopfilter_msa.h" void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, @@ -27,7 +28,7 @@ void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); - VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); p1_d = __msa_copy_u_d((v2i64)p1_out, 0); p0_d = __msa_copy_u_d((v2i64)p0_out, 0); @@ -86,7 +87,7 @@ void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch, q3); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); - VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1); ILVRL_H2_SH(vec1, vec0, vec2, vec3); diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_8_msa.c b/libs/libvpx/vpx_dsp/mips/loopfilter_8_msa.c index 403e5dc51b..703fcce8a7 100644 --- a/libs/libvpx/vpx_dsp/mips/loopfilter_8_msa.c +++ b/libs/libvpx/vpx_dsp/mips/loopfilter_8_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/loopfilter_msa.h" void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, @@ -32,7 +33,7 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); - VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); @@ -177,7 +178,7 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, /* flat4 */ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); /* filter4 */ - VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c index 6325762a2a..9c1f5143f2 100644 --- a/libs/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c @@ -717,14 +717,13 @@ static void mb_lpf_horizontal_edge(unsigned char *s, int pitch, } } -void vpx_lpf_horizontal_edge_8_dspr2(unsigned char *s, int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { +void vpx_lpf_horizontal_16_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1); } -void vpx_lpf_horizontal_edge_16_dspr2(unsigned char *s, int pitch, +void vpx_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h b/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h index 4063e5e6b8..49fd74c25a 100644 --- a/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h +++ b/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h @@ -13,144 +13,71 @@ #include 
"vpx_dsp/mips/macros_msa.h" -#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ - p1_out, p0_out, q0_out, q1_out) \ - { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ - v8i16 q0_sub_p0_r, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - filt = filt & (v16i8)hev_in; \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = __msa_ldi_h(3); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - /* combine left and right part */ \ - filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \ - \ - filt = filt & (v16i8)mask_in; \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ - \ - filt = __msa_srari_b(filt1, 1); \ - hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ - filt = filt & (v16i8)hev_in; \ - \ - q1_m = __msa_subs_s_b(q1_m, filt); \ - q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, filt); \ - p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ +#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out, \ + p0_out, q0_out, q1_out) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ + const v16i8 cnst4b = __msa_ldi_b(4); \ + const v16i8 cnst3b = __msa_ldi_b(3); \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + filt &= hev; \ + q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt &= mask; \ + t1 = __msa_adds_s_b(filt, cnst4b); \ + t1 >>= cnst3b; \ + t2 = __msa_adds_s_b(filt, cnst3b); \ + t2 >>= cnst3b; \ + q0_m = __msa_subs_s_b(q0_m, t1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, t2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + filt = __msa_srari_b(t1, 1); \ + hev = __msa_xori_b(hev, 0xff); \ + filt &= hev; \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ } -#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ - p1_out, p0_out, q0_out, q1_out) \ - { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ - v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - \ - filt = filt & (v16i8)hev_in; \ - \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = 
__msa_ldi_h(3); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \ - filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ - filt_l += q0_sub_p0_l; \ - filt_l = __msa_sat_s_h(filt_l, 7); \ - \ - filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ - filt = filt & (v16i8)mask_in; \ - \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ - \ - filt = __msa_srari_b(filt1, 1); \ - hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ - filt = filt & (v16i8)hev_in; \ - \ - q1_m = __msa_subs_s_b(q1_m, filt); \ - q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, filt); \ - p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ - } - -#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ - { \ - v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ - v16u8 zero_in = { 0 }; \ - \ - tmp = __msa_ori_b(zero_in, 1); \ - p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ - q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ - p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ - q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ - \ - p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ - flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ - p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ - flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ - \ - flat_out = (tmp < (v16u8)flat_out); \ - flat_out = __msa_xori_b(flat_out, 0xff); \ - flat_out = flat_out & (mask); \ +#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ + { \ + v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ + v16u8 zero_in = { 0 }; \ + \ + tmp_flat4 = __msa_ori_b(zero_in, 1); \ + p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ + q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ + p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ + q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ + \ + p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ + flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ + p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ + flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ + \ + flat_out = (tmp_flat4 < (v16u8)flat_out); \ + flat_out = __msa_xori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ } #define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ q6_in, q7_in, flat_in, flat2_out) \ { \ - v16u8 tmp, zero_in = { 0 }; \ + v16u8 tmp_flat5, zero_in = { 0 }; \ v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ \ - tmp = __msa_ori_b(zero_in, 1); \ + tmp_flat5 = __msa_ori_b(zero_in, 1); \ p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ @@ -168,7 +95,7 @@ p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ \ - flat2_out = (tmp < (v16u8)flat2_out); \ + flat2_out = (tmp_flat5 < (v16u8)flat2_out); \ flat2_out 
= __msa_xori_b(flat2_out, 0xff); \ flat2_out = flat2_out & flat_in; \ } @@ -177,38 +104,38 @@ p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ q1_filt8_out, q2_filt8_out) \ { \ - v8u16 tmp0, tmp1, tmp2; \ + v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ \ - tmp2 = p2_in + p1_in + p0_in; \ - tmp0 = p3_in << 1; \ + tmp_filt8_2 = p2_in + p1_in + p0_in; \ + tmp_filt8_0 = p3_in << 1; \ \ - tmp0 = tmp0 + tmp2 + q0_in; \ - tmp1 = tmp0 + p3_in + p2_in; \ - p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \ + tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \ + p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp1 = tmp0 + p1_in + q1_in; \ - p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \ + p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp1 = q2_in + q1_in + q0_in; \ - tmp2 = tmp2 + tmp1; \ - tmp0 = tmp2 + (p0_in); \ - tmp0 = tmp0 + (p3_in); \ - p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3); \ + tmp_filt8_1 = q2_in + q1_in + q0_in; \ + tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \ + tmp_filt8_0 = tmp_filt8_2 + (p0_in); \ + tmp_filt8_0 = tmp_filt8_0 + (p3_in); \ + p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \ \ - tmp0 = q2_in + q3_in; \ - tmp0 = p0_in + tmp1 + tmp0; \ - tmp1 = q3_in + q3_in; \ - tmp1 = tmp1 + tmp0; \ - q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_0 = q2_in + q3_in; \ + tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \ + tmp_filt8_1 = q3_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \ + q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp0 = tmp2 + q3_in; \ - tmp1 = tmp0 + q0_in; \ - q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_0 = tmp_filt8_2 + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + q0_in; \ + q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp1 = tmp0 - p2_in; \ - tmp0 = q1_in + q3_in; \ - tmp1 = tmp0 + tmp1; \ - q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_1 = tmp_filt8_0 - p2_in; \ + tmp_filt8_0 = q1_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \ + q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ } #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ diff --git a/libs/libvpx/vpx_dsp/mips/macros_msa.h b/libs/libvpx/vpx_dsp/mips/macros_msa.h index f498fbe9de..f9a446e7bc 100644 --- a/libs/libvpx/vpx_dsp/mips/macros_msa.h +++ b/libs/libvpx/vpx_dsp/mips/macros_msa.h @@ -16,207 +16,149 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) -#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) +#define LD_V(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UB(...) LD_V(v16u8, __VA_ARGS__) +#define LD_SB(...) LD_V(v16i8, __VA_ARGS__) +#define LD_UH(...) LD_V(v8u16, __VA_ARGS__) +#define LD_SH(...) LD_V(v8i16, __VA_ARGS__) +#define LD_SW(...) LD_V(v4i32, __VA_ARGS__) -#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) -#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) - -#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) - -#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) -#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) - -#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_SH(...) 
ST_H(v8i16, __VA_ARGS__) - -#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_SW(...) ST_W(v4i32, __VA_ARGS__) +#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_UB(...) ST_V(v16u8, __VA_ARGS__) +#define ST_SB(...) ST_V(v16i8, __VA_ARGS__) +#define ST_SH(...) ST_V(v8i16, __VA_ARGS__) +#define ST_SW(...) ST_V(v4i32, __VA_ARGS__) #if (__mips_isa_rev >= 6) -#define LH(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LH(psrc) \ + ({ \ + uint16_t val_lh_m = *(const uint16_t *)(psrc); \ + val_lh_m; \ }) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + ({ \ + uint32_t val_lw_m = *(const uint32_t *)(psrc); \ + val_lw_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + uint64_t val_ld_m = *(const uint64_t *)(psrc); \ + val_ld_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m); \ - val1_m = LW(psrc_m + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint32_t val0_ld_m, val1_ld_m; \ + uint64_t val_ld_m = 0; \ + \ + val0_ld_m = LW(psrc_ld_m); \ + val1_ld_m = LW(psrc_ld_m + 4); \ + \ + val_ld_m = (uint64_t)(val1_ld_m); \ + val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \ + val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \ + \ + val_ld_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } - -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } - -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint64_t val_m = (val); \ - \ - __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } +#define SH(val, pdst) *(uint16_t *)(pdst) = (val); +#define SW(val, pdst) *(uint32_t *)(pdst) = (val); +#define SD(val, pdst) *(uint64_t *)(pdst) = (val); #else // !(__mips_isa_rev >= 6) -#define LH(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LH(psrc) \ + ({ \ + const uint8_t *psrc_lh_m = (const uint8_t *)(psrc); \ + uint16_t val_lh_m; \ + \ + __asm__ __volatile__("ulh %[val_lh_m], %[psrc_lh_m] \n\t" \ + \ + : [val_lh_m] "=r"(val_lh_m) \ + : [psrc_lh_m] 
"m"(*psrc_lh_m)); \ + \ + val_lh_m; \ }) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \ + uint32_t val_lw_m; \ + \ + __asm__ __volatile__("ulw %[val_lw_m], %[psrc_lw_m] \n\t" \ + \ + : [val_lw_m] "=r"(val_lw_m) \ + : [psrc_lw_m] "m"(*psrc_lw_m)); \ + \ + val_lw_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint64_t val_ld_m = 0; \ + \ + __asm__ __volatile__("uld %[val_ld_m], %[psrc_ld_m] \n\t" \ + \ + : [val_ld_m] "=r"(val_ld_m) \ + : [psrc_ld_m] "m"(*psrc_ld_m)); \ + \ + val_ld_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m1); \ - val1_m = LW(psrc_m1 + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint32_t val0_ld_m, val1_ld_m; \ + uint64_t val_ld_m = 0; \ + \ + val0_ld_m = LW(psrc_ld_m); \ + val1_ld_m = LW(psrc_ld_m + 4); \ + \ + val_ld_m = (uint64_t)(val1_ld_m); \ + val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \ + val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \ + \ + val_ld_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SH(val, pdst) \ + { \ + uint8_t *pdst_sh_m = (uint8_t *)(pdst); \ + const uint16_t val_sh_m = (val); \ + \ + __asm__ __volatile__("ush %[val_sh_m], %[pdst_sh_m] \n\t" \ + \ + : [pdst_sh_m] "=m"(*pdst_sh_m) \ + : [val_sh_m] "r"(val_sh_m)); \ } -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SW(val, pdst) \ + { \ + uint8_t *pdst_sw_m = (uint8_t *)(pdst); \ + const uint32_t val_sw_m = (val); \ + \ + __asm__ __volatile__("usw %[val_sw_m], %[pdst_sw_m] \n\t" \ + \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_sw_m] "r"(val_sw_m)); \ } -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m1 = (uint8_t *)(pdst); \ - uint32_t val0_m, val1_m; \ - \ - val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ - val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ - \ - SW(val0_m, pdst_m1); \ - SW(val1_m, pdst_m1 + 4); \ +#define SD(val, pdst) \ + { \ + uint8_t *pdst_sd_m = (uint8_t *)(pdst); \ + uint32_t val0_sd_m, val1_sd_m; \ + \ + val0_sd_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_sd_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_sd_m, pdst_sd_m); \ + SW(val1_sd_m, pdst_sd_m + 4); \ } #endif // (__mips_isa_rev >= 6) @@ -283,97 +225,73 @@ SD(in3, (pdst) + 3 * stride); \ } -/* Description : Load vectors with 16 byte elements 
with stride +/* Description : Load vector elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 Return Type - as per RTYPE Details : Load 16 byte elements in 'out0' from (psrc) Load 16 byte elements in 'out1' from (psrc + stride) */ -#define LD_B2(RTYPE, psrc, stride, out0, out1) \ +#define LD_V2(RTYPE, psrc, stride, out0, out1) \ { \ - out0 = LD_B(RTYPE, (psrc)); \ - out1 = LD_B(RTYPE, (psrc) + stride); \ + out0 = LD_V(RTYPE, (psrc)); \ + out1 = LD_V(RTYPE, (psrc) + stride); \ } -#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) -#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) +#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__) +#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__) +#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__) -#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ +#define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \ { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ + LD_V2(RTYPE, (psrc), stride, out0, out1); \ + out2 = LD_V(RTYPE, (psrc) + 2 * stride); \ } -#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) +#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__) -#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ +#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \ { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + LD_V2(RTYPE, (psrc), stride, out0, out1); \ + LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ } -#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) -#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) +#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__) +#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__) -#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ +#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ + LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + out4 = LD_V(RTYPE, (psrc) + 4 * stride); \ } -#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) -#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) +#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__) +#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__) -#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ +#define LD_V7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ { \ - LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ - LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ + LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ + LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ } -#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) +#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__) -#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ +#define LD_V8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ out7) \ { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ } -#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) -#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) +#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__) +#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__) +#define LD_SH8(...) 
LD_V8(v8i16, __VA_ARGS__) -/* Description : Load vectors with 8 halfword elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Details : Load 8 halfword elements in 'out0' from (psrc) - Load 8 halfword elements in 'out1' from (psrc + stride) -*/ -#define LD_H2(RTYPE, psrc, stride, out0, out1) \ - { \ - out0 = LD_H(RTYPE, (psrc)); \ - out1 = LD_H(RTYPE, (psrc) + (stride)); \ - } -#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) - -#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ - { \ - LD_H2(RTYPE, (psrc), stride, out0, out1); \ - LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ - } -#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) - -#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ - out7) \ - { \ - LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ - } -#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) - -#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ +#define LD_V16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ out7, out8, out9, out10, out11, out12, out13, out14, out15) \ { \ - LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ + LD_V8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ out7); \ - LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ + LD_V8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ out13, out14, out15); \ } -#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) +#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__) /* Description : Load 4x4 block of signed halfword elements from 1D source data into 4 vectors (Each vector with 4 signed halfwords) @@ -388,79 +306,35 @@ out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ } -/* Description : Load 2 vectors of signed word elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Return Type - signed word -*/ -#define LD_SW2(psrc, stride, out0, out1) \ - { \ - out0 = LD_SW((psrc)); \ - out1 = LD_SW((psrc) + stride); \ - } - -/* Description : Store vectors of 16 byte elements with stride +/* Description : Store vectors with stride Arguments : Inputs - in0, in1, pdst, stride Details : Store 16 byte elements from 'in0' to (pdst) Store 16 byte elements from 'in1' to (pdst + stride) */ -#define ST_B2(RTYPE, in0, in1, pdst, stride) \ +#define ST_V2(RTYPE, in0, in1, pdst, stride) \ { \ - ST_B(RTYPE, in0, (pdst)); \ - ST_B(RTYPE, in1, (pdst) + stride); \ + ST_V(RTYPE, in0, (pdst)); \ + ST_V(RTYPE, in1, (pdst) + stride); \ } -#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) +#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__) +#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__) +#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__) -#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ +#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \ { \ - ST_B2(RTYPE, in0, in1, (pdst), stride); \ - ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + ST_V2(RTYPE, in0, in1, (pdst), stride); \ + ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ } -#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) +#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__) +#define ST_SH4(...) 
ST_V4(v8i16, __VA_ARGS__) -#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ +#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ { \ - ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ - ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ - } -#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) - -/* Description : Store vectors of 8 halfword elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 8 halfword elements from 'in0' to (pdst) - Store 8 halfword elements from 'in1' to (pdst + stride) -*/ -#define ST_H2(RTYPE, in0, in1, pdst, stride) \ - { \ - ST_H(RTYPE, in0, (pdst)); \ - ST_H(RTYPE, in1, (pdst) + stride); \ - } -#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) - -#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ - { \ - ST_H2(RTYPE, in0, in1, (pdst), stride); \ - ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ - } -#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) - -#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ - { \ - ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ - ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ - } -#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) - -/* Description : Store vectors of word elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 4 word elements from 'in0' to (pdst) - Store 4 word elements from 'in1' to (pdst + stride) -*/ -#define ST_SW2(in0, in1, pdst, stride) \ - { \ - ST_SW(in0, (pdst)); \ - ST_SW(in1, (pdst) + stride); \ + ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride); \ + ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ } +#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__) +#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__) /* Description : Store 2x4 byte block to destination memory from input vector Arguments : Inputs - in, stidx, pdst, stride @@ -681,6 +555,7 @@ #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) +#define VSHF_B2_SH(...) 
VSHF_B2(v8i16, __VA_ARGS__) #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ out3) \ @@ -909,27 +784,42 @@ sum_m; \ }) -/* Description : Horizontal addition of 8 unsigned halfword elements - Arguments : Inputs - in (unsigned halfword vector) - Outputs - sum_m (u32 sum) - Return Type - unsigned word - Details : 8 unsigned halfword elements of input vector are added - together and the resulting integer sum is returned +/* Description : Horizontal addition of 4 unsigned word elements + Arguments : Input - in (unsigned word vector) + Output - sum_m (u32 sum) + Return Type - unsigned word (GP) + Details : 4 unsigned word elements of 'in' vector are added together and + the resulting integer sum is returned */ -#define HADD_UH_U32(in) \ +#define HADD_UW_U32(in) \ ({ \ - v4u32 res_m; \ v2u64 res0_m, res1_m; \ uint32_t sum_m; \ \ - res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ - res0_m = __msa_hadd_u_d(res_m, res_m); \ + res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \ res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ - res0_m = res0_m + res1_m; \ + res0_m += res1_m; \ sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ sum_m; \ }) +/* Description : Horizontal addition of 8 unsigned halfword elements + Arguments : Input - in (unsigned halfword vector) + Output - sum_m (u32 sum) + Return Type - unsigned word + Details : 8 unsigned halfword elements of 'in' vector are added + together and the resulting integer sum is returned +*/ +#define HADD_UH_U32(in) \ + ({ \ + v4u32 res_m; \ + uint32_t sum_m; \ + \ + res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ + sum_m = HADD_UW_U32(res_m); \ + sum_m; \ + }) + /* Description : Horizontal addition of unsigned byte vector elements Arguments : Inputs - in0, in1 Outputs - out0, out1 @@ -1034,6 +924,7 @@ } #define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) +#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__) /* Description : Interleave even byte elements from vectors Arguments : Inputs - in0, in1, in2, in3 @@ -1292,6 +1183,7 @@ out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ } #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) +#define ILVRL_W2_SB(...) ILVRL_W2(v16i8, __VA_ARGS__) #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) @@ -1544,6 +1436,12 @@ Details : Each element of vector 'in0' is right shifted by 'shift' and the result is written in-place. 'shift' is a GP variable. 
*/ +#define SRA_2V(in0, in1, shift) \ + { \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + } + #define SRA_4V(in0, in1, in2, in3, shift) \ { \ in0 = in0 >> shift; \ @@ -1699,6 +1597,25 @@ out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ } +/* Description : Sign extend byte elements from input vector and return + halfword results in pair of vectors + Arguments : Input - in (byte vector) + Outputs - out0, out1 (sign extended halfword vectors) + Return Type - signed halfword + Details : Sign bit of byte elements from input vector 'in' is + extracted and interleaved right with same vector 'in0' to + generate 8 signed halfword elements in 'out0' + Then interleaved left with same vector 'in0' to + generate 8 signed halfword elements in 'out1' +*/ +#define UNPCK_SB_SH(in, out0, out1) \ + { \ + v16i8 tmp_m; \ + \ + tmp_m = __msa_clti_s_b((v16i8)in, 0); \ + ILVRL_B2_SH(tmp_m, in, out0, out1); \ + } + /* Description : Zero extend unsigned byte elements to halfword elements Arguments : Input - in (unsigned byte vector) Outputs - out0, out1 (unsigned halfword vectors) @@ -1857,8 +1774,6 @@ out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ \ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ @@ -2012,20 +1927,17 @@ /* Description : Converts inputs to unsigned bytes, interleave, average & store as 8x4 unsigned byte block - Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, - pdst, stride + Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride */ -#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \ - pdst, stride) \ - { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - \ - tmp0_m = PCKEV_XORI128_UB(in0, in1); \ - tmp1_m = PCKEV_XORI128_UB(in2, in3); \ - ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ +#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + tmp0_m = PCKEV_XORI128_UB(in0, in1); \ + tmp1_m = PCKEV_XORI128_UB(in2, in3); \ + AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ } /* Description : Pack even byte elements and store byte vector in destination diff --git a/libs/libvpx/vpx_dsp/mips/sad_mmi.c b/libs/libvpx/vpx_dsp/mips/sad_mmi.c new file mode 100644 index 0000000000..33bd3fe7f9 --- /dev/null +++ b/libs/libvpx/vpx_dsp/mips/sad_mmi.c @@ -0,0 +1,805 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/asmdefs_mmi.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +#define SAD_SRC_REF_ABS_SUB_64 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x27(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x20(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x2f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x28(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x27(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x20(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x2f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x28(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x37(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x30(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x3f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x38(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x37(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x30(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x3f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x38(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_REF_ABS_SUB_32 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], 
%[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_REF_ABS_SUB_16 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_REF_ABS_SUB_8 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + +#if _MIPS_SIM == _ABIO32 +#define SAD_SRC_REF_ABS_SUB_4 \ + "ulw %[tmp0], 0x00(%[src]) \n\t" \ + "mtc1 %[tmp0], %[ftmp1] \n\t" \ + "ulw %[tmp0], 0x00(%[ref]) \n\t" \ + "mtc1 %[tmp0], %[ftmp2] \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */ +#define SAD_SRC_REF_ABS_SUB_4 \ + "gslwlc1 %[ftmp1], 0x03(%[src]) \n\t" \ + "gslwrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gslwlc1 %[ftmp2], 0x03(%[ref]) \n\t" \ + "gslwrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#endif /* _MIPS_SIM == _ABIO32 */ + +#define SAD_SRC_AVGREF_ABS_SUB_64 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ 
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x27(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x20(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x2f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x28(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x27(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x20(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x2f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x28(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x27(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x20(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x2f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x28(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x37(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x30(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x3f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x38(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x37(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x30(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x3f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x38(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x37(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x30(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x3f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x38(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_AVGREF_ABS_SUB_32 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + 
"gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_AVGREF_ABS_SUB_16 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_AVGREF_ABS_SUB_8 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + +#if _MIPS_SIM == _ABIO32 +#define SAD_SRC_AVGREF_ABS_SUB_4 \ + "ulw %[tmp0], 0x00(%[second_pred]) \n\t" \ + "mtc1 %[tmp0], %[ftmp1] \n\t" \ + "ulw %[tmp0], 0x00(%[ref]) \n\t" \ + "mtc1 %[tmp0], %[ftmp2] \n\t" \ + "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */ +#define SAD_SRC_AVGREF_ABS_SUB_4 \ + "gslwlc1 %[ftmp1], 0x03(%[second_pred]) \n\t" \ + "gslwrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gslwlc1 %[ftmp2], 0x03(%[ref]) \n\t" \ + "gslwrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#endif /* _MIPS_SIM == _ABIO32 */ + +// depending on call sites, pass **ref_array to avoid & in subsequent call and +// de-dup with 4D below. 
+#define sadMxNxK_mmi(m, n, k) \ + void vpx_sad##m##x##n##x##k##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref_array, int ref_stride, \ + uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < k; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_mmi(src, src_stride, &ref_array[i], ref_stride); \ + } + +// This appears to be equivalent to the above when k == 4 and refs is const +#define sadMxNx4D_mmi(m, n) \ + void vpx_sad##m##x##n##x4d_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_mmi(src, src_stride, ref_array[i], ref_stride); \ + } + +static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_64 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_64 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad64xN(H) \ + unsigned int vpx_sad64x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad64x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad64xN(64); +vpx_sad64xN(32); +sadMxNx4D_mmi(64, 64); +sadMxNx4D_mmi(64, 32); + +static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
+ SAD_SRC_AVGREF_ABS_SUB_64 + MMI_ADDIU(%[second_pred], %[second_pred], 0x40) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_64 + MMI_ADDIU(%[second_pred], %[second_pred], 0x40) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg64xN(H) \ + unsigned int vpx_sad64x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg64x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg64xN(64); +vpx_sad_avg64xN(32); + +static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_32 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_32 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad32xN(H) \ + unsigned int vpx_sad32x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad32x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad32xN(64); +vpx_sad32xN(32); +vpx_sad32xN(16); +sadMxNx4D_mmi(32, 64); +sadMxNx4D_mmi(32, 32); +sadMxNx4D_mmi(32, 16); + +static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
+ SAD_SRC_AVGREF_ABS_SUB_32 + MMI_ADDIU(%[second_pred], %[second_pred], 0x20) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_32 + MMI_ADDIU(%[second_pred], %[second_pred], 0x20) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg32xN(H) \ + unsigned int vpx_sad32x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg32x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg32xN(64); +vpx_sad_avg32xN(32); +vpx_sad_avg32xN(16); + +static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_16 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_16 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad16xN(H) \ + unsigned int vpx_sad16x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad16x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad16xN(32); +vpx_sad16xN(16); +vpx_sad16xN(8); +sadMxNxK_mmi(16, 16, 3); +sadMxNxK_mmi(16, 16, 8); +sadMxNxK_mmi(16, 8, 3); +sadMxNxK_mmi(16, 8, 8); +sadMxNx4D_mmi(16, 32); +sadMxNx4D_mmi(16, 16); +sadMxNx4D_mmi(16, 8); + +static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
+ SAD_SRC_AVGREF_ABS_SUB_16 + MMI_ADDIU(%[second_pred], %[second_pred], 0x10) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_16 + MMI_ADDIU(%[second_pred], %[second_pred], 0x10) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg16xN(H) \ + unsigned int vpx_sad16x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg16x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg16xN(32); +vpx_sad_avg16xN(16); +vpx_sad_avg16xN(8); + +static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_8 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_8 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad8xN(H) \ + unsigned int vpx_sad8x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad8x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad8xN(16); +vpx_sad8xN(8); +vpx_sad8xN(4); +sadMxNxK_mmi(8, 16, 3); +sadMxNxK_mmi(8, 16, 8); +sadMxNxK_mmi(8, 8, 3); +sadMxNxK_mmi(8, 8, 8); +sadMxNx4D_mmi(8, 16); +sadMxNx4D_mmi(8, 8); +sadMxNx4D_mmi(8, 4); + +static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
+ SAD_SRC_AVGREF_ABS_SUB_8 + MMI_ADDIU(%[second_pred], %[second_pred], 0x08) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_8 + MMI_ADDIU(%[second_pred], %[second_pred], 0x08) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg8xN(H) \ + unsigned int vpx_sad8x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg8x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg8xN(16); +vpx_sad_avg8xN(8); +vpx_sad_avg8xN(4); + +static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_4 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_4 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad4xN(H) \ + unsigned int vpx_sad4x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad4x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad4xN(8); +vpx_sad4xN(4); +sadMxNxK_mmi(4, 4, 3); +sadMxNxK_mmi(4, 4, 8); +sadMxNx4D_mmi(4, 8); +sadMxNx4D_mmi(4, 4); + +static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
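+      // The 4-wide macro loads only 32 bits of ref/second_pred per row and
+      // clears the upper half of the difference register (mthc1 $0) so biadd
+      // sums just the four valid bytes.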
+ SAD_SRC_AVGREF_ABS_SUB_4 + MMI_ADDIU(%[second_pred], %[second_pred], 0x04) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_4 + MMI_ADDIU(%[second_pred], %[second_pred], 0x04) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg4xN(H) \ + unsigned int vpx_sad4x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg4x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg4xN(8); +vpx_sad_avg4xN(4); diff --git a/libs/libvpx/vpx_dsp/mips/sad_msa.c b/libs/libvpx/vpx_dsp/mips/sad_msa.c index 6455814e1b..ab681ae9f8 100644 --- a/libs/libvpx/vpx_dsp/mips/sad_msa.c +++ b/libs/libvpx/vpx_dsp/mips/sad_msa.c @@ -283,96 +283,6 @@ static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, sad_array[2] = HADD_UH_U32(sad2); } -static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = height >> 1; ht_cnt--;) { - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3; - v8u16 sad0_0 = { 0 }; - v8u16 sad0_1 = { 0 }; - v8u16 sad1_0 = { 0 }; - v8u16 sad1_1 = { 0 }; - v8u16 sad2_0 = { 0 }; - v8u16 sad2_1 = { 0 }; - v4u32 sad; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3); - ref0_4 = LD_UB(ref + 64); - ref += ref_stride; - - sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, 
ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[0] = HADD_SW_S32((v4i32)sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[1] = HADD_SW_S32((v4i32)sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[2] = HADD_SW_S32((v4i32)sad); -} - static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, int32_t height, uint32_t *sad_array) { @@ -623,176 +533,6 @@ static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, sad_array[7] = HADD_UH_U32(sad7); } -static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1; - v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); - sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); - sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); - sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); - sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); - sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - const uint8_t *src_dup, *ref_dup; - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4; - v16u8 ref0, ref1, ref2, ref3; - v8u16 sad0_0 = { 0 }; - v8u16 sad0_1 = { 0 }; - v8u16 sad1_0 = { 0 }; - v8u16 sad1_1 = { 0 }; - v8u16 sad2_0 = { 0 }; - v8u16 sad2_1 = { 0 }; - v8u16 sad3_0 = { 0 }; - v8u16 sad3_1 = { 0 }; - v4u32 sad; - - src_dup = src; - ref_dup = ref; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); - ref += ref_stride; - - sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 
1); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3); - sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[0] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[1] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[2] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad3_0, sad3_0); - sad += __msa_hadd_u_w(sad3_1, sad3_1); - sad_array[3] = HADD_SW_S32(sad); - - sad0_0 = (v8u16)__msa_ldi_h(0); - sad0_1 = (v8u16)__msa_ldi_h(0); - sad1_0 = (v8u16)__msa_ldi_h(0); - sad1_1 = (v8u16)__msa_ldi_h(0); - sad2_0 = (v8u16)__msa_ldi_h(0); - sad2_1 = (v8u16)__msa_ldi_h(0); - sad3_0 = (v8u16)__msa_ldi_h(0); - sad3_1 = (v8u16)__msa_ldi_h(0); - - for (ht_cnt = 64; ht_cnt--;) { - LD_UB4(src_dup, 16, src0, src1, src2, src3); - src_dup += src_stride; - LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); - ref_dup += ref_stride; - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4); - sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7); - sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[4] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[5] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[6] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad3_0, sad3_0); - sad += __msa_hadd_u_w(sad3_1, sad3_1); - sad_array[7] = HADD_SW_S32(sad); -} - static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *const aref_ptr[], int32_t ref_stride, int32_t height, @@ -1030,6 +770,7 @@ static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride, v8u16 sad2_1 = { 0 }; v8u16 sad3_0 = { 0 }; v8u16 sad3_1 = { 0 }; + v4u32 sad; ref0_ptr = aref_ptr[0]; ref1_ptr = aref_ptr[1]; @@ -1061,14 +802,21 @@ static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride, sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); } - sad_array[0] = HADD_UH_U32(sad0_0); - sad_array[0] += HADD_UH_U32(sad0_1); - sad_array[1] = HADD_UH_U32(sad1_0); - sad_array[1] += HADD_UH_U32(sad1_1); - sad_array[2] = HADD_UH_U32(sad2_0); - 
sad_array[2] += HADD_UH_U32(sad2_1); - sad_array[3] = HADD_UH_U32(sad3_0); - sad_array[3] += HADD_UH_U32(sad3_1); + sad = __msa_hadd_u_w(sad0_0, sad0_0); + sad += __msa_hadd_u_w(sad0_1, sad0_1); + sad_array[0] = HADD_UW_U32(sad); + + sad = __msa_hadd_u_w(sad1_0, sad1_0); + sad += __msa_hadd_u_w(sad1_1, sad1_1); + sad_array[1] = HADD_UW_U32(sad); + + sad = __msa_hadd_u_w(sad2_0, sad2_0); + sad += __msa_hadd_u_w(sad2_1, sad2_1); + sad_array[2] = HADD_UW_U32(sad); + + sad = __msa_hadd_u_w(sad3_0, sad3_0); + sad += __msa_hadd_u_w(sad3_1, sad3_1); + sad_array[3] = HADD_UW_U32(sad); } static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, @@ -1310,20 +1058,6 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ } -#define VPX_SAD_32xHEIGHTx3_MSA(height) \ - void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_64xHEIGHTx3_MSA(height) \ - void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - #define VPX_SAD_4xHEIGHTx8_MSA(height) \ void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ @@ -1345,20 +1079,6 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ } -#define VPX_SAD_32xHEIGHTx8_MSA(height) \ - void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_64xHEIGHTx8_MSA(height) \ - void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - #define VPX_SAD_4xHEIGHTx4D_MSA(height) \ void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *const refs[], \ @@ -1436,43 +1156,31 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, // 64x64 VPX_SAD_64xHEIGHT_MSA(64); -VPX_SAD_64xHEIGHTx3_MSA(64); -VPX_SAD_64xHEIGHTx8_MSA(64); VPX_SAD_64xHEIGHTx4D_MSA(64); VPX_AVGSAD_64xHEIGHT_MSA(64); // 64x32 VPX_SAD_64xHEIGHT_MSA(32); -VPX_SAD_64xHEIGHTx3_MSA(32); -VPX_SAD_64xHEIGHTx8_MSA(32); VPX_SAD_64xHEIGHTx4D_MSA(32); VPX_AVGSAD_64xHEIGHT_MSA(32); // 32x64 VPX_SAD_32xHEIGHT_MSA(64); -VPX_SAD_32xHEIGHTx3_MSA(64); -VPX_SAD_32xHEIGHTx8_MSA(64); VPX_SAD_32xHEIGHTx4D_MSA(64); VPX_AVGSAD_32xHEIGHT_MSA(64); // 32x32 VPX_SAD_32xHEIGHT_MSA(32); -VPX_SAD_32xHEIGHTx3_MSA(32); -VPX_SAD_32xHEIGHTx8_MSA(32); VPX_SAD_32xHEIGHTx4D_MSA(32); VPX_AVGSAD_32xHEIGHT_MSA(32); // 32x16 VPX_SAD_32xHEIGHT_MSA(16); -VPX_SAD_32xHEIGHTx3_MSA(16); -VPX_SAD_32xHEIGHTx8_MSA(16); VPX_SAD_32xHEIGHTx4D_MSA(16); VPX_AVGSAD_32xHEIGHT_MSA(16); // 16x32 VPX_SAD_16xHEIGHT_MSA(32); -VPX_SAD_16xHEIGHTx3_MSA(32); -VPX_SAD_16xHEIGHTx8_MSA(32); VPX_SAD_16xHEIGHTx4D_MSA(32); VPX_AVGSAD_16xHEIGHT_MSA(32); @@ -1506,15 +1214,11 @@ VPX_AVGSAD_8xHEIGHT_MSA(8); // 8x4 VPX_SAD_8xHEIGHT_MSA(4); -VPX_SAD_8xHEIGHTx3_MSA(4); -VPX_SAD_8xHEIGHTx8_MSA(4); 
VPX_SAD_8xHEIGHTx4D_MSA(4); VPX_AVGSAD_8xHEIGHT_MSA(4); // 4x8 VPX_SAD_4xHEIGHT_MSA(8); -VPX_SAD_4xHEIGHTx3_MSA(8); -VPX_SAD_4xHEIGHTx8_MSA(8); VPX_SAD_4xHEIGHTx4D_MSA(8); VPX_AVGSAD_4xHEIGHT_MSA(8); diff --git a/libs/libvpx/vpx_dsp/mips/subtract_mmi.c b/libs/libvpx/vpx_dsp/mips/subtract_mmi.c new file mode 100644 index 0000000000..9f361704a8 --- /dev/null +++ b/libs/libvpx/vpx_dsp/mips/subtract_mmi.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { + double ftmp[13]; + uint32_t tmp[1]; + + if (rows == cols) { + switch (rows) { + case 4: + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp1] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp2] \n\t" +#else + "gslwlc1 %[ftmp1], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp2], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp2], 0x00(%[pred]) \n\t" +#endif + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp3] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp4] \n\t" +#else + "gslwlc1 %[ftmp3], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp3], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp4], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp4], 0x00(%[pred]) \n\t" +#endif + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp5] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp6] \n\t" +#else + "gslwlc1 %[ftmp5], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp6], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[pred]) \n\t" +#endif + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" +#else + "gslwlc1 %[ftmp7], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp8], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp8], 0x00(%[pred]) \n\t" +#endif + "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp2], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp4], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp6], %[ftmp0] 
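+          // Advance diff by one row; diff_stride is doubled in the operand
+          // list because diff holds int16_t elements.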
\n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp8], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), + [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), + [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), + [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), + [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), +#if _MIPS_SIM == _ABIO32 + [tmp0] "=&r"(tmp[0]), +#endif + [src] "+&r"(src), [pred] "+&r"(pred), [diff] "+&r"(diff) + : [src_stride] "r"((mips_reg)src_stride), + [pred_stride] "r"((mips_reg)pred_stride), + [diff_stride] "r"((mips_reg)(diff_stride * 2)) + : "memory"); + break; + case 8: + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x02 \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp3], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp4], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp6], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp7], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp8], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp2], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp4], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + 
MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp8], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp8], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), + [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), + [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), + [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), + [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]), [src] "+&r"(src), + [pred] "+&r"(pred), [diff] "+&r"(diff) + : [pred_stride] "r"((mips_reg)pred_stride), + [src_stride] "r"((mips_reg)src_stride), + [diff_stride] "r"((mips_reg)(diff_stride * 2)) + : "memory"); + break; + case 16: + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x08 \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[pred]) \n\t" + "gsldlc1 %[ftmp3], 0x0f(%[src]) \n\t" + "gsldrc1 %[ftmp3], 0x08(%[src]) \n\t" + "gsldlc1 %[ftmp4], 0x0f(%[pred]) \n\t" + "gsldrc1 %[ftmp4], 0x08(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp6], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[pred]) \n\t" + "gsldlc1 %[ftmp7], 0x0f(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x08(%[src]) \n\t" + "gsldlc1 %[ftmp8], 0x0f(%[pred]) \n\t" + "gsldrc1 %[ftmp8], 0x08(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp2], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp4], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x17(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x10(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x1f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x18(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + "punpcklbh %[ftmp9], 
%[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp8], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp8], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x17(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x10(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x1f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x18(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), + [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), + [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), + [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), + [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]), [src] "+&r"(src), + [pred] "+&r"(pred), [diff] "+&r"(diff) + : [pred_stride] "r"((mips_reg)pred_stride), + [src_stride] "r"((mips_reg)src_stride), + [diff_stride] "r"((mips_reg)(diff_stride * 2)) + : "memory"); + break; + case 32: + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, + pred, pred_stride); + break; + case 64: + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, + pred, pred_stride); + break; + default: + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, + pred, pred_stride); + break; + } + } else { + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, pred, + pred_stride); + } +} diff --git a/libs/libvpx/vpx_dsp/mips/sum_squares_msa.c b/libs/libvpx/vpx_dsp/mips/sum_squares_msa.c new file mode 100644 index 0000000000..d4563dc410 --- /dev/null +++ b/libs/libvpx/vpx_dsp/mips/sum_squares_msa.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "./macros_msa.h" + +uint64_t vpx_sum_squares_2d_i16_msa(const int16_t *src, int src_stride, + int size) { + int row, col; + uint64_t ss_res = 0; + v4i32 mul0, mul1; + v2i64 res0 = { 0 }; + + if (4 == size) { + uint64_t src0, src1, src2, src3; + v8i16 diff0 = { 0 }; + v8i16 diff1 = { 0 }; + + LD4(src, src_stride, src0, src1, src2, src3); + INSERT_D2_SH(src0, src1, diff0); + INSERT_D2_SH(src2, src3, diff1); + DOTP_SH2_SW(diff0, diff1, diff0, diff1, mul0, mul1); + mul0 += mul1; + res0 = __msa_hadd_s_d(mul0, mul0); + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else if (8 == size) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + mul0 += mul1; + res0 = __msa_hadd_s_d(mul0, mul0); + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else if (16 == size) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += 8 * src_stride; + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + mul0 += mul1; + res0 += __msa_hadd_s_d(mul0, mul0); + + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else if (0 == (size % 16)) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + + for (row = 0; row < (size >> 4); row++) { + for (col = 0; col < size; col += 16) { + const int16_t *src_ptr = src + col; + LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6, + src7); + DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5, + src6, src7); + src_ptr += 8 * src_stride; + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6, + src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + 
DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5, + src6, src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + mul0 += mul1; + res0 += __msa_hadd_s_d(mul0, mul0); + } + + src += 16 * src_stride; + } + + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else { + int16_t val; + + for (row = 0; row < size; row++) { + for (col = 0; col < size; col++) { + val = src[col]; + ss_res += val * val; + } + + src += src_stride; + } + } + + return ss_res; +} diff --git a/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h b/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h index da100f6a98..f077fa4814 100644 --- a/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h +++ b/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h @@ -15,19 +15,24 @@ #define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ { \ - v8i16 k0_m = __msa_fill_h(cnst0); \ - v4i32 s0_m, s1_m, s2_m, s3_m; \ + v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \ + v8i16 k0_m, k1_m, k2_m, zero = { 0 }; \ \ - s0_m = (v4i32)__msa_fill_h(cnst1); \ - k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \ + k0_m = __msa_fill_h(cnst0); \ + k1_m = __msa_fill_h(cnst1); \ + k2_m = __msa_ilvev_h((v8i16)k1_m, k0_m); \ + k0_m = __msa_ilvev_h((v8i16)zero, k0_m); \ + k1_m = __msa_ilvev_h(k1_m, (v8i16)zero); \ \ - ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \ + ILVRL_H2_SW(reg1, reg0, s5_m, s4_m); \ ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \ - DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \ + DOTP_SH2_SW(s5_m, s4_m, k0_m, k0_m, s1_m, s0_m); \ + s1_m = __msa_dpsub_s_w(s1_m, (v8i16)s5_m, k1_m); \ + s0_m = __msa_dpsub_s_w(s0_m, (v8i16)s4_m, k1_m); \ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ \ - DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \ + DOTP_SH2_SW(s3_m, s2_m, k2_m, k2_m, s1_m, s0_m); \ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ } diff --git a/libs/libvpx/vpx_dsp/mips/variance_mmi.c b/libs/libvpx/vpx_dsp/mips/variance_mmi.c new file mode 100644 index 0000000000..4af60d3634 --- /dev/null +++ b/libs/libvpx/vpx_dsp/mips/variance_mmi.c @@ -0,0 +1,1280 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/variance.h" +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/asmdefs_mmi.h" + +static const uint8_t bilinear_filters[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +/* Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64,vpx_variance64x32, + vpx_variance32x64. VARIANCE_SSE_SUM_8 will lead to sum overflow. 
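+   (VARIANCE_SSE_SUM_8 keeps the running sums in 16-bit halfword lanes via
+   paddh; this variant subtracts and accumulates in 32-bit word lanes with
+   psubw/paddw, so the larger 64-wide and 32x64 blocks do not wrap.)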
*/ +#define VARIANCE_SSE_SUM_8_FOR_W64 \ + /* sse */ \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp6] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp7] \n\t" \ + \ + /* sum */ \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp2], %[ftmp3], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \ + "psubw %[ftmp3], %[ftmp1], %[ftmp7] \n\t" \ + "psubw %[ftmp5], %[ftmp2], %[ftmp8] \n\t" \ + "punpcklhw %[ftmp1], %[ftmp4], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \ + "psubw %[ftmp4], %[ftmp1], %[ftmp7] \n\t" \ + "psubw %[ftmp6], %[ftmp2], %[ftmp8] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp3] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp4] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp5] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" + +#define VARIANCE_SSE_SUM_4 \ + /* sse */ \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp5], %[ftmp4], %[ftmp4] \n\t" \ + "paddw %[ftmp6], %[ftmp6], %[ftmp5] \n\t" \ + \ + /* sum */ \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \ + "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + +#define VARIANCE_SSE_SUM_8 \ + /* sse */ \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" \ + \ + /* sum */ \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" \ + "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" \ + "paddh %[ftmp12], %[ftmp12], %[ftmp6] \n\t" + +#define VARIANCE_SSE_8 \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + +#define VARIANCE_SSE_16 \ + VARIANCE_SSE_8 \ + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw 
%[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A \ + /* calculate fdata3[0]~fdata3[3], store at ftmp2*/ \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp3], %[ftmp3], %[filter_x1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B \ + /* calculate fdata3[0]~fdata3[3], store at ftmp4*/ \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \ + \ + /* store: temp2[0] ~ temp2[3] */ \ + "and %[ftmp2], %[ftmp2], %[mask] \n\t" \ + "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ + "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \ + \ + /* store: temp2[0] ~ temp2[3] */ \ + "and %[ftmp4], %[ftmp4], %[mask] \n\t" \ + "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ + "gssdrc1 %[ftmp4], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ + /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ + "pmullh %[ftmp3], %[ftmp3], %[filter_x0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp4], %[ftmp4], %[filter_x1] \n\t" \ + "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \ + "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ + /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "punpcklbh %[ftmp8], 
%[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp8], %[ftmp8], %[filter_x0] \n\t" \ + "pmullh %[ftmp9], %[ftmp9], %[filter_x0] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp10], %[ftmp10], %[filter_x1] \n\t" \ + "pmullh %[ftmp11], %[ftmp11], %[filter_x1] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" \ + "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \ + "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp8], %[filter_y1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[4] ~ temp2[7] */ \ + "pmullh %[ftmp3], %[ftmp3], %[filter_y0] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp9], %[filter_y1] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" \ + "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" \ + \ + /* store: temp2[0] ~ temp2[7] */ \ + "and %[ftmp2], %[ftmp2], %[mask] \n\t" \ + "and %[ftmp3], %[ftmp3], %[mask] \n\t" \ + "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ + "gssdlc1 %[ftmp2], 0x07(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp8], %[ftmp8], %[filter_y0] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" \ + "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[4] ~ temp2[7] */ \ + "pmullh %[ftmp9], %[ftmp9], %[filter_y0] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp3], %[filter_y1] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ftmp1] \n\t" \ + "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" \ + \ + /* store: temp2[0] ~ temp2[7] */ \ + "and %[ftmp8], %[ftmp8], %[mask] \n\t" \ + "and %[ftmp9], %[ftmp9], %[mask] \n\t" \ + "packushb %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "gssdlc1 %[ftmp8], 0x07(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp8], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A \ + /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ + \ + /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/ \ + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \ + "punpcklbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ + "pmullh %[ftmp5], %[ftmp5], %[filter_x0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp6], %[ftmp6], %[filter_x1] \n\t" \ + "pmullh %[ftmp7], %[ftmp7], %[filter_x1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], 
%[ftmp14] \n\t" \ + "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B \ + /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ + \ + /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/ \ + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \ + "punpcklbh %[ftmp12], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp13], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp10], %[ftmp10], %[filter_x0] \n\t" \ + "pmullh %[ftmp11], %[ftmp11], %[filter_x0] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp12], %[ftmp12], %[filter_x1] \n\t" \ + "pmullh %[ftmp13], %[ftmp13], %[filter_x1] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ftmp13] \n\t" \ + "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \ + "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A \ + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \ + \ + /* calculate: temp2[8] ~ temp2[11] */ \ + "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp10], %[filter_y1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[12] ~ temp2[15] */ \ + "pmullh %[ftmp5], %[ftmp5], %[filter_y0] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp11], %[filter_y1] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" \ + \ + /* store: temp2[8] ~ temp2[15] */ \ + "and %[ftmp4], %[ftmp4], %[mask] \n\t" \ + "and %[ftmp5], %[ftmp5], %[mask] \n\t" \ + "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "gssdlc1 %[ftmp4], 0x0f(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp4], 0x08(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B \ + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \ + \ + /* calculate: temp2[8] ~ temp2[11] */ \ + "pmullh %[ftmp10], %[ftmp10], %[filter_y0] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \ + "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[12] ~ temp2[15] */ \ + "pmullh %[ftmp11], %[ftmp11], %[filter_y0] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp5], %[filter_y1] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ftmp1] \n\t" \ + "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" \ + \ + /* store: temp2[8] ~ temp2[15] */ \ + "and %[ftmp10], %[ftmp10], %[mask] \n\t" \ + "and %[ftmp11], %[ftmp11], %[mask] \n\t" \ + "packushb %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ + "gssdlc1 %[ftmp10], 0x0f(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp10], 0x08(%[temp2_ptr]) \n\t" + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the first-pass of 2-D separable filter. +// +// Produces int16_t output to retain precision for the next pass. Two filter +// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is +// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). 
+// It defines the offset required to move from one input to the next. +static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + b[j] = ROUND_POWER_OF_TWO( + (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + + ++a; + } + + a += src_pixels_per_line - output_width; + b += output_width; + } +} + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the second-pass of 2-D separable filter. +// +// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two +// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the +// filter is applied horizontally (pixel_step = 1) or vertically +// (pixel_step = stride). It defines the offset required to move from one input +// to the next. Output is 8-bit. +static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + b[j] = ROUND_POWER_OF_TWO( + (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + ++a; + } + + a += src_pixels_per_line - output_width; + b += output_width; + } +} + +static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[12]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x27(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x20(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x27(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x20(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x2f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x28(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x2f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x28(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x37(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x30(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x37(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x30(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x3f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x38(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x3f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x38(%[b]) \n\t" + 
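/* Aside: the plain-C var_filter_block2d_bil_{first,second}_pass helpers just
 * defined are the reference behaviour the MMI filter macros have to match: a
 * 2-tap pass into a 16-bit (H + 1) x W intermediate, then a second 2-tap pass
 * back down to 8 bits.  The sketch below is a self-contained scalar model of
 * that composition for a 4x4 block.  It is illustration only; FILTER_BITS == 7
 * (so the two taps sum to 128) is an assumption about the surrounding file,
 * and the helper name is hypothetical. */
#include <stdint.h>

#define BIL_FILTER_BITS 7
#define BIL_ROUND(v) (((v) + (1 << (BIL_FILTER_BITS - 1))) >> BIL_FILTER_BITS)

static void subpel_bilinear_4x4_sketch(const uint8_t *src, int src_stride,
                                       const uint8_t xf[2], const uint8_t yf[2],
                                       uint8_t out[4 * 4]) {
  uint16_t mid[5 * 4]; /* the first pass keeps H + 1 rows for the vertical pass */
  int r, c;
  /* first pass: horizontal 2-tap filter (pixel_step == 1) */
  for (r = 0; r < 5; ++r)
    for (c = 0; c < 4; ++c)
      mid[r * 4 + c] = BIL_ROUND(src[r * src_stride + c] * xf[0] +
                                 src[r * src_stride + c + 1] * xf[1]);
  /* second pass: vertical 2-tap filter (pixel_step == output width) */
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      out[r * 4 + c] = (uint8_t)BIL_ROUND(mid[r * 4 + c] * yf[0] +
                                          mid[(r + 1) * 4 + c] * yf[1]);
}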
VARIANCE_SSE_SUM_8_FOR_W64 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "mfc1 %[tmp1], %[ftmp9] \n\t" + "mfhc1 %[tmp2], %[ftmp9] \n\t" + "addu %[sum], %[tmp1], %[tmp2] \n\t" + "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "swc1 %[ftmp1], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [tmp2]"=&r"(tmp[2]), + [a]"+&r"(a), [b]"+&r"(b), + [sum]"=&r"(sum) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (64 * high)); +} + +#define VPX_VARIANCE64XN(n) \ + uint32_t vpx_variance64x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_variance64x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE64XN(64) +VPX_VARIANCE64XN(32) + +uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, uint32_t *sse) { + int sum; + double ftmp[12]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + "li %[tmp0], 0x40 \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "mfc1 %[tmp1], %[ftmp9] \n\t" + "mfhc1 %[tmp2], %[ftmp9] \n\t" + "addu %[sum], %[tmp1], %[tmp2] \n\t" + "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "swc1 %[ftmp1], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [tmp2]"=&r"(tmp[2]), + [a]"+&r"(a), [b]"+&r"(b), + [sum]"=&r"(sum) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [sse]"r"(sse) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / 2048); +} + +static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[13]; + uint32_t tmp[3]; + + *sse 
= 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (32 * high)); +} + +#define VPX_VARIANCE32XN(n) \ + uint32_t vpx_variance32x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_variance32x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE32XN(32) +VPX_VARIANCE32XN(16) + +static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[13]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + 
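/* Aside, for readers of the variance kernels in this hunk.  First, the closing
 * "li 0x20 / dsrl / paddw / swc1" sequence is a horizontal add: each 64-bit
 * accumulator holds two packed 32-bit partial results, so shifting the
 * register right by 32 and adding folds the high lane onto the low lane before
 * the low word is stored.  Second, the C return statements apply the usual
 * shortcut identity
 *     variance = SSE - (sum * sum) / N,   N = width * height,
 * which is why the 32x64 variant divides by the literal 2048 and why sum is
 * widened to int64_t (for a 64x64 block sum can reach about +/- 1.04e6, so
 * sum * sum does not fit in 32 bits).  The following is only a scalar sketch
 * of that arithmetic, with hypothetical helper names: */
#include <stdint.h>

/* Model of the dsrl-by-32 / paddw fold used to finish each accumulator. */
static uint32_t fold_hi_lo(uint64_t packed_acc) {
  return (uint32_t)(packed_acc >> 32) + (uint32_t)packed_acc;
}

static uint32_t variance_ref_sketch(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride, int w,
                                    int h, uint32_t *sse) {
  int64_t sum = 0;
  uint64_t sse_lanes = 0; /* two packed 32-bit partial SSE sums, as in the asm */
  int r, c;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      const int d = a[c] - b[c];
      sum += d;
      /* alternate columns between the low and high lane; for these block
       * sizes each lane stays well below 2^32, so no carry is lost */
      sse_lanes += (uint64_t)(d * d) << ((c & 1) ? 32 : 0);
    }
    a += a_stride;
    b += b_stride;
  }
  *sse = fold_hi_lo(sse_lanes);
  return (uint32_t)(*sse - (uint64_t)((sum * sum) / (w * h)));
}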
"dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (16 * high)); +} + +#define VPX_VARIANCE16XN(n) \ + uint32_t vpx_variance16x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_variance16x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE16XN(32) +VPX_VARIANCE16XN(16) +VPX_VARIANCE16XN(8) + +static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[13]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (8 * high)); +} + +#define VPX_VARIANCE8XN(n) \ + uint32_t vpx_variance8x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t 
*sse) { \ + return vpx_variance8x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE8XN(16) +VPX_VARIANCE8XN(8) +VPX_VARIANCE8XN(4) + +static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[12]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_4 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp6], %[ftmp10] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp7], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp7], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp8], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp8], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "dsrl %[ftmp0], %[ftmp3], %[ftmp10] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), + [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (4 * high)); +} + +#define VPX_VARIANCE4XN(n) \ + uint32_t vpx_variance4x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_variance4x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE4XN(8) +VPX_VARIANCE4XN(4) + +static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, uint32_t *sse, + uint64_t high) { + double ftmp[12]; + uint32_t tmp[1]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + + "1: \n\t" + VARIANCE_SSE_16 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse) + : "memory" + ); + + return *sse; +} + +#define vpx_mse16xN(n) \ + uint32_t vpx_mse16x##n##_mmi(const uint8_t *a, 
int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_mse16x(a, a_stride, b, b_stride, sse, n); \ + } + +vpx_mse16xN(16); +vpx_mse16xN(8); + +static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, uint32_t *sse, + uint64_t high) { + double ftmp[12]; + uint32_t tmp[1]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + + "1: \n\t" + VARIANCE_SSE_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse) + : "memory" + ); + + return *sse; +} + +#define vpx_mse8xN(n) \ + uint32_t vpx_mse8x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + return vpx_mse8x(a, a_stride, b, b_stride, sse, n); \ + } + +vpx_mse8xN(16); +vpx_mse8xN(8); + +#define SUBPIX_VAR(W, H) \ + uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[yoffset]); \ + \ + return vpx_variance##W##x##H##_mmi(temp2, W, b, b_stride, sse); \ + } + +SUBPIX_VAR(64, 64) +SUBPIX_VAR(64, 32) +SUBPIX_VAR(32, 64) +SUBPIX_VAR(32, 32) +SUBPIX_VAR(32, 16) +SUBPIX_VAR(16, 32) + +static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + uint8_t *temp2, int counter) { + uint8_t *temp2_ptr = temp2; + mips_reg l_counter = counter; + double ftmp[15]; + mips_reg tmp[2]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; + + const uint8_t *filter_x = bilinear_filters[xoffset]; + const uint8_t *filter_y = bilinear_filters[yoffset]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x07) + MMI_MTC1(%[tmp0], %[ftmp14]) + "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" + "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" + "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" + "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" + + // fdata3: fdata3[0] ~ fdata3[15] + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A + + // fdata3 +a_stride*1: fdata3[0] ~ fdata3[15] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B + // temp2: temp2[0] ~ temp2[15] + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A + + // fdata3 +a_stride*2: fdata3[0] ~ fdata3[15] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A + // temp2+16*1: temp2[0] ~ temp2[15] + MMI_ADDIU(%[temp2_ptr], 
%[temp2_ptr], 0x10) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B + + "1: \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A + + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B + "addiu %[counter], %[counter], -0x01 \n\t" + "bnez %[counter], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), + [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), + [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), + [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr), + [counter]"+&r"(l_counter) + : [filter_x0] "f"((uint64_t)filter_x[0]), + [filter_x1] "f"((uint64_t)filter_x[1]), + [filter_y0] "f"((uint64_t)filter_y[0]), + [filter_y1] "f"((uint64_t)filter_y[1]), + [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [mask] "f"(mask) + : "memory" + ); +} + +#define SUBPIX_VAR16XN(H) \ + uint32_t vpx_sub_pixel_variance16x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint8_t temp2[16 * H]; \ + var_filter_block2d_bil_16x(a, a_stride, xoffset, yoffset, temp2, \ + (H - 2) / 2); \ + \ + return vpx_variance16x##H##_mmi(temp2, 16, b, b_stride, sse); \ + } + +SUBPIX_VAR16XN(16) +SUBPIX_VAR16XN(8) + +static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + uint8_t *temp2, int counter) { + uint8_t *temp2_ptr = temp2; + mips_reg l_counter = counter; + double ftmp[15]; + mips_reg tmp[2]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; + const uint8_t *filter_x = bilinear_filters[xoffset]; + const uint8_t *filter_y = bilinear_filters[yoffset]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x07) + MMI_MTC1(%[tmp0], %[ftmp14]) + "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" + "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" + "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" + "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" + + // fdata3: fdata3[0] ~ fdata3[7] + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A + + // fdata3 +a_stride*1: fdata3[0] ~ fdata3[7] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B + // temp2: temp2[0] ~ temp2[7] + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A + + // fdata3 +a_stride*2: fdata3[0] ~ fdata3[7] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A + // temp2+8*1: temp2[0] ~ temp2[7] + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B + + "1: \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A + + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B + "addiu %[counter], %[counter], -0x01 \n\t" + "bnez %[counter], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] 
"=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), + [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), + [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), + [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr), + [counter]"+&r"(l_counter) + : [filter_x0] "f"((uint64_t)filter_x[0]), + [filter_x1] "f"((uint64_t)filter_x[1]), + [filter_y0] "f"((uint64_t)filter_y[0]), + [filter_y1] "f"((uint64_t)filter_y[1]), + [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [mask] "f"(mask) + : "memory" + ); +} + +#define SUBPIX_VAR8XN(H) \ + uint32_t vpx_sub_pixel_variance8x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint8_t temp2[8 * H]; \ + var_filter_block2d_bil_8x(a, a_stride, xoffset, yoffset, temp2, \ + (H - 2) / 2); \ + \ + return vpx_variance8x##H##_mmi(temp2, 8, b, b_stride, sse); \ + } + +SUBPIX_VAR8XN(16) +SUBPIX_VAR8XN(8) +SUBPIX_VAR8XN(4) + +static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + uint8_t *temp2, int counter) { + uint8_t *temp2_ptr = temp2; + mips_reg l_counter = counter; + double ftmp[7]; + mips_reg tmp[2]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; + const uint8_t *filter_x = bilinear_filters[xoffset]; + const uint8_t *filter_y = bilinear_filters[yoffset]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x07) + MMI_MTC1(%[tmp0], %[ftmp6]) + "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" + "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" + "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" + "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" + // fdata3: fdata3[0] ~ fdata3[3] + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A + + // fdata3 +a_stride*1: fdata3[0] ~ fdata3[3] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B + // temp2: temp2[0] ~ temp2[7] + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A + + // fdata3 +a_stride*2: fdata3[0] ~ fdata3[3] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A + // temp2+4*1: temp2[0] ~ temp2[7] + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B + + "1: \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A + + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B + "addiu %[counter], %[counter], -0x01 \n\t" + "bnez %[counter], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), + [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter) + : [filter_x0] "f"((uint64_t)filter_x[0]), + [filter_x1] "f"((uint64_t)filter_x[1]), + [filter_y0] "f"((uint64_t)filter_y[0]), + [filter_y1] "f"((uint64_t)filter_y[1]), + [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [mask] "f"(mask) + : "memory" + ); +} + +#define SUBPIX_VAR4XN(H) \ + uint32_t vpx_sub_pixel_variance4x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int 
b_stride, uint32_t *sse) { \ + uint8_t temp2[4 * H]; \ + var_filter_block2d_bil_4x(a, a_stride, xoffset, yoffset, temp2, \ + (H - 2) / 2); \ + \ + return vpx_variance4x##H##_mmi(temp2, 4, b, b_stride, sse); \ + } + +SUBPIX_VAR4XN(8) +SUBPIX_VAR4XN(4) + +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[yoffset]); \ + \ + vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ + \ + return vpx_variance##W##x##H##_mmi(temp3, W, b, b_stride, sse); \ + } + +SUBPIX_AVG_VAR(64, 64) +SUBPIX_AVG_VAR(64, 32) +SUBPIX_AVG_VAR(32, 64) +SUBPIX_AVG_VAR(32, 32) +SUBPIX_AVG_VAR(32, 16) +SUBPIX_AVG_VAR(16, 32) +SUBPIX_AVG_VAR(16, 16) +SUBPIX_AVG_VAR(16, 8) +SUBPIX_AVG_VAR(8, 16) +SUBPIX_AVG_VAR(8, 8) +SUBPIX_AVG_VAR(8, 4) +SUBPIX_AVG_VAR(4, 8) +SUBPIX_AVG_VAR(4, 4) diff --git a/libs/libvpx/vpx_dsp/mips/variance_msa.c b/libs/libvpx/vpx_dsp/mips/variance_msa.c index 085990e484..49b2f99230 100644 --- a/libs/libvpx/vpx_dsp/mips/variance_msa.c +++ b/libs/libvpx/vpx_dsp/mips/variance_msa.c @@ -489,27 +489,19 @@ static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride, uint32_t vpx_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride) { - uint32_t err = 0; uint32_t src0, src1, src2, src3; uint32_t ref0, ref1, ref2, ref3; v16i8 src = { 0 }; v16i8 ref = { 0 }; - v16u8 src_vec0, src_vec1; - v8i16 diff0, diff1; v4i32 err0 = { 0 }; - v4i32 err1 = { 0 }; LW4(src_ptr, src_stride, src0, src1, src2, src3); LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); INSERT_W4_SB(src0, src1, src2, src3, src); INSERT_W4_SB(ref0, ref1, ref2, ref3, ref); - ILVRL_B2_UB(src, ref, src_vec0, src_vec1); - HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1); - DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1); - err = HADD_SW_S32(err0); - err += HADD_SW_S32(err1); + CALC_MSE_B(src, ref, err0); - return err; + return HADD_SW_S32(err0); } #define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c index ad2af28669..187a013421 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c @@ -16,8 +16,9 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 dst0, dst1, dst2, dst3, res2, res3; + v16u8 dst0 = { 0 }, res; v16u8 mask0, mask1, mask2, mask3; v8i16 filt, res0, res1; @@ -36,23 +37,23 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, XORI_B4_128_SB(src0, src1, src2, src3); HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, res0, res1); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); SRARI_H2_SH(res0, res1, FILTER_BITS); SAT_SH2_SH(res0, res1, 7); - 
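/* Aside: the two averaging paths touched around here reduce to the same
 * per-pixel operation.  SUBPIX_AVG_VAR above averages the filtered prediction
 * with second_pred via vpx_comp_avg_pred_c before measuring variance, and the
 * rewritten MSA "and_aver_dst" kernels below average the convolution result
 * with the bytes already in dst, replacing the old interleave/AVER_UB2_UB
 * sequence with a single averaging step on a packed dst vector.  In both cases
 * the average is the rounded one, (p + q + 1) >> 1; reading aver_u.b as a
 * rounding average is an assumption here, the scalar sketch below is not. */
#include <stdint.h>

/* Rounded byte average, the scalar core of comp-avg prediction. */
static uint8_t avg_round(uint8_t p, uint8_t q) {
  return (uint8_t)((p + q + 1) >> 1);
}

/* Sketch of a comp-avg pass over a W x H block, in the spirit of the
 * vpx_comp_avg_pred_c(comp, pred, W, H, ref, ref_stride) call above;
 * the function name here is hypothetical. */
static void comp_avg_sketch(uint8_t *comp, const uint8_t *pred, int w, int h,
                            const uint8_t *ref, int ref_stride) {
  int r, c;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) comp[c] = avg_round(pred[c], ref[c]);
    comp += w;
    pred += w;
    ref += ref_stride;
  }
}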
PCKEV_B2_UB(res0, res0, res1, res1, res2, res3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - XORI_B2_128_UB(res2, res3); - AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); + res = PCKEV_XORI128_UB(res0, res1); + res = (v16u8)__msa_aver_u_b(res, dst0); + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); } static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v8i16 filt, vec0, vec1, vec2, vec3; mask0 = LD_UB(&mc_filt_mask_arr[16]); @@ -69,7 +70,10 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, LD_SB4(src, src_stride, src0, src1, src2, src3); XORI_B4_128_SB(src0, src1, src2, src3); src += (4 * src_stride); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, vec0, vec1); LD_SB4(src, src_stride, src0, src1, src2, src3); @@ -82,10 +86,7 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, res3); ILVR_D2_UB(res1, res0, res3, res2, res0, res2); XORI_B2_128_UB(res0, res2); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); - ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4); - AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2); + AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2); ST4x8_UB(res0, res2, dst, dst_stride); } @@ -105,8 +106,9 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t dst_stride, int8_t *filter, int32_t height) { int32_t loop_cnt; + int64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3; + v16u8 mask0, mask1, mask2, mask3, dst0 = { 0 }, dst1 = { 0 }; v8i16 filt, out0, out1, out2, out3; mask0 = LD_UB(&mc_filt_mask_arr[0]); @@ -127,10 +129,12 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); SAT_SH4_SH(out0, out1, out2, out3, 7); - CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); } @@ -309,8 +313,9 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1; + v16u8 filt0, dst0 = { 0 }, vec0, vec1, res; v8u16 vec2, vec3, filt; mask = LD_SB(&mc_filt_mask_arr[16]); @@ -320,23 +325,24 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB4(src, src_stride, src0, 
src1, src2, src3); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); SRARI_H2_UH(vec2, vec3, FILTER_BITS); - PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + res = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + res = (v16u8)__msa_aver_u_b(res, dst0); + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); } static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v8u16 vec4, vec5, vec6, vec7, filt; mask = LD_SB(&mc_filt_mask_arr[16]); @@ -346,7 +352,10 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, @@ -354,13 +363,9 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, res3); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); - AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, - res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); + ILVR_D2_UB(res1, res0, res3, res2, res0, res2); + AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2); + ST4x8_UB(res0, res2, dst, dst_stride); } static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src, @@ -378,8 +383,9 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + int64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 filt0, dst0 = { 0 }, dst1 = { 0 }; v8u16 vec0, vec1, vec2, vec3, filt; mask = LD_SB(&mc_filt_mask_arr[0]); @@ -394,16 +400,18 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); } static void common_hz_2t_and_aver_dst_8x8mult_msa( const uint8_t *src, int32_t src_stride, uint8_t 
*dst, int32_t dst_stride, int8_t *filter, int32_t height) { + int64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 filt0, dst0 = { 0 }, dst1 = { 0 }; v8u16 vec0, vec1, vec2, vec3, filt; mask = LD_SB(&mc_filt_mask_arr[0]); @@ -419,11 +427,12 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); LD_SB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); @@ -431,9 +440,10 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); if (16 == height) { @@ -445,10 +455,11 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); LD_SB4(src, src_stride, src0, src1, src2, src3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); @@ -456,9 +467,10 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); } } @@ -633,9 +645,10 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; int8_t cnt, filt_hor[8]; assert(x_step_q4 == 16); @@ -668,8 +681,8 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, &filt_hor[3], h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, 
src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -695,8 +708,8 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filt_hor, h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c index 1cfa63201c..5187cea21c 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c @@ -16,8 +16,9 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1; + v16u8 dst0 = { 0 }, mask0, mask1, mask2, mask3, res; v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4; @@ -59,7 +60,8 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa( XORI_B4_128_SB(src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); @@ -73,14 +75,12 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa( vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); SRARI_H2_SH(res0, res1, FILTER_BITS); SAT_SH2_SH(res0, res1, 7); - PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1); - XORI_B2_128_UB(tmp0, tmp1); - AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1); - ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); + res = PCKEV_XORI128_UB(res0, res1); + res = (v16u8)__msa_aver_u_b(res, dst0); + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); hz_out5 = hz_out9; @@ -94,10 +94,11 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; - v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3; + v16u8 dst0 = { 0 }, dst1 = { 0 }, mask0, mask1, mask2, mask3; v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; @@ -144,7 +145,9 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa( XORI_B4_128_SB(src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + 
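/* Aside: the recurring change in these MSA hunks is that the destination rows
 * are no longer fetched with LD_UB4/LD_UB8 (four or eight full 16-byte vector
 * loads), but with LW4/LD4 scalar loads inserted into one or two vectors via
 * INSERT_W4_UB/INSERT_D2_UB.  The apparent intent is to touch only the 4 or 8
 * bytes per row that belong to the block (narrow rows no longer read past
 * their end) and to halve the number of dst registers; that reading of the
 * helper macros is an assumption.  A scalar model of the 4-wide case: */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Gather four 4-byte dst rows into one 16-byte lane-packed buffer, touching
 * only the 16 bytes of the 4x4 block (LW4 + INSERT_W4_UB in the asm).  The
 * 8-wide kernels do the same with 64-bit loads into two buffers. */
static void load_dst_4x4_sketch(const uint8_t *dst, ptrdiff_t stride,
                                uint8_t packed[16]) {
  uint32_t row;
  int r;
  for (r = 0; r < 4; ++r) {
    memcpy(&row, dst + r * stride, sizeof(row)); /* LW: one 32-bit load   */
    memcpy(packed + 4 * r, &row, sizeof(row));   /* INSERT_W: word lane r */
  }
}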
INSERT_D2_UB(tp2, tp3, dst1); hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); @@ -172,7 +175,7 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa( SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); - CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst, + CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); @@ -225,9 +228,10 @@ static void common_hv_8ht_8vt_and_aver_dst_64w_msa( static void common_hv_2ht_2vt_and_aver_dst_4x4_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, mask; v16u8 filt_hz, filt_vt, vec0, vec1; - v16u8 dst0, dst1, dst2, dst3, res0, res1; + v16u8 dst0 = { 0 }, out; v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt; mask = LD_SB(&mc_filt_mask_arr[16]); @@ -248,21 +252,22 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_msa( hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); - AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, dst0); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); } static void common_hv_2ht_2vt_and_aver_dst_4x8_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; - v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -289,21 +294,18 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_msa( hz_out3, hz_out5, 8); hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2, - res3); - AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, - res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, 
res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST4x8_UB(res0, res1, dst, dst_stride); } static void common_hv_2ht_2vt_and_aver_dst_4w_msa( @@ -321,8 +323,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4w_msa( static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 filt_hz, filt_vt, dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3; v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -338,7 +341,9 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( LD_SB5(src, src_stride, src0, src1, src2, src3, src4); src += (5 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); @@ -357,16 +362,16 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( tmp3 = __msa_dotp_u_h(vec3, filt_vt); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); } static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3; + v16u8 filt_hz, filt_vt, vec0, dst0 = { 0 }, dst1 = { 0 }; v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -407,9 +412,10 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( tmp3 = __msa_dotp_u_h(vec0, filt_vt); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -516,9 +522,10 @@ static void common_hv_2ht_2vt_and_aver_dst_64w_msa( void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_hor[8], filt_ver[8]; assert(x_step_q4 == 16); @@ -560,14 +567,14 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_hor[3], &filt_ver[3], h); break; default: - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else if (((const int32_t *)filter_x)[0] == 0 || ((const int32_t *)filter_y)[0] == 0) { - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, 
- filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } else { switch (w) { case 4: @@ -596,8 +603,8 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver, h); break; default: - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c index 146ce3b2f5..ef8c901140 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c @@ -17,8 +17,9 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3, out; + v16u8 dst0 = { 0 }, out; v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; v16i8 src10998, filt0, filt1, filt2, filt3; @@ -43,7 +44,8 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, LD_SB4(src, src_stride, src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, src87_r, src98_r, src109_r); ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); @@ -55,9 +57,6 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, SRARI_H2_SH(out10, out32, FILTER_BITS); SAT_SH2_SH(out10, out32, 7); out = PCKEV_XORI128_UB(out10, out32); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - - dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0); out = __msa_aver_u_b(out, dst0); ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); @@ -75,8 +74,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; v8i16 filt, out0, out1, out2, out3; @@ -98,7 +98,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, LD_SB4(src, src_stride, src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); XORI_B4_128_SB(src7, src8, src9, src10); ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, src87_r, src98_r, src109_r); @@ -112,7 +114,7 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, filt1, filt2, filt3); SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); SAT_SH4_SH(out0, out1, out2, out3, 7); - CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); @@ -246,8 +248,9 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t 
*src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4; - v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332; + v16u8 dst0 = { 0 }, out, filt0, src2110, src4332; v16i8 src10_r, src32_r, src21_r, src43_r; v8i16 filt; v8u16 tmp0, tmp1; @@ -261,9 +264,8 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, src4 = LD_SB(src); src += src_stride; - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1); - dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, src32_r, src43_r); ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); @@ -280,7 +282,8 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + uint32_t tp0, tp1, tp2, tp3; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r; v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; v16u8 src2110, src4332, src6554, src8776, filt0; @@ -294,10 +297,10 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, src += (8 * src_stride); src8 = LD_SB(src); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2, - dst3); - ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, src32_r, src43_r); ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, @@ -309,9 +312,7 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332); - ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride); + ST4x8_UB(src2110, src4332, dst, dst_stride); } static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src, @@ -329,8 +330,9 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + int64_t tp0, tp1, tp2, tp3; v16u8 src0, src1, src2, src3, src4; - v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0; + v16u8 dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3, filt0; v8u16 tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -339,22 +341,24 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, filt0 = (v16u8)__msa_splati_h(filt, 0); LD_UB5(src, src_stride, src0, src1, src2, src3, src4); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, 
tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); } static void common_vt_2t_and_aver_dst_8x8mult_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; + int64_t tp0, tp1, tp2, tp3; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; - v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; v8u16 tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -369,7 +373,12 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa( for (loop_cnt = (height >> 3); loop_cnt--;) { LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); src += (8 * src_stride); - LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst2); + INSERT_D2_UB(tp2, tp3, dst3); ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, vec3); @@ -378,15 +387,13 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst2, dst3, dst, dst_stride); dst += (4 * dst_stride); src0 = src8; @@ -605,9 +612,10 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_ver[8]; assert(y_step_q4 == 16); @@ -640,8 +648,8 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, &filt_ver[3], h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -668,8 +676,8 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filt_ver, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c index 9e8bf7b519..152dc26104 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c @@ -621,9 +621,10 @@ static void 
common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; int8_t cnt, filt_hor[8]; assert(x_step_q4 == 16); @@ -656,8 +657,8 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_hor[3], h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -683,8 +684,8 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, filt_hor, h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c index b16ec57886..d35a5a7a63 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c @@ -541,9 +541,11 @@ static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride, } void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int32_t x_step_q4, const int16_t *filter_y, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_hor[8], filt_ver[8]; assert(x_step_q4 == 16); @@ -585,14 +587,14 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, &filt_ver[3], (int32_t)h); break; default: - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else if (((const int32_t *)filter_x)[0] == 0 || ((const int32_t *)filter_y)[0] == 0) { - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); } else { switch (w) { case 4: @@ -621,9 +623,605 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, (int32_t)h); break; default: - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } } + +static void filter_horiz_w4_msa(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + uint32_t res; + v16u8 src0 = { 0 }, src1 = { 0 }, dst0; + v16i8 out0, out1; + v16i8 shf1 = { 0, 8, 16, 24, 4, 12, 20, 28, 1, 9, 17, 25, 5, 13, 21, 29 }; + v16i8 shf2 = shf1 + 2; + v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 }; + v16i8 filt_shf1 = filt_shf0 + 2; + v16i8 filt_shf2 = filt_shf0 + 4; + v16i8 filt_shf3 = filt_shf0 + 6; + v8i16 
filt, src0_h, src1_h, src2_h, src3_h, filt0, filt1, filt2, filt3; + + LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src0); + INSERT_D2_UB(srcd2, srcd3, src1); + VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1); + XORI_B2_128_SB(out0, out1); + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + + filt = LD_SH(x_filter); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3); + + src0_h *= filt0; + src0_h += src1_h * filt1; + src0_h += src2_h * filt2; + src0_h += src3_h * filt3; + + src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8); + + src0_h = __msa_adds_s_h(src0_h, src1_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + res = __msa_copy_u_w((v4i32)dst0, 0); + SW(res, dst); +} + +static void filter_horiz_w8_msa(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; + v16u8 tmp0, tmp1, tmp2, tmp3, dst0; + v16i8 out0, out1, out2, out3; + v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; + v16i8 shf2 = shf1 + 4; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + + LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src0); + INSERT_D2_UB(srcd2, srcd3, src1); + LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src2); + INSERT_D2_UB(srcd2, srcd3, src3); + + filt = LD_SH(x_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + // transpose + VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3); + ILVRL_W2_SB(tmp2, tmp0, out0, out1); + ILVRL_W2_SB(tmp3, tmp1, out2, out3); + + XORI_B4_128_SB(out0, out1, out2, out3); + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + UNPCK_SB_SH(out2, src4_h, src5_h); + UNPCK_SB_SH(out3, src6_h, src7_h); + + src0_h *= filt0; + src4_h *= filt4; + src0_h += src1_h * filt1; + src4_h += src5_h * filt5; + src0_h += src2_h * filt2; + src4_h += src6_h * filt6; + src0_h += src3_h * filt3; + src4_h += src7_h * filt7; + + src0_h = __msa_adds_s_h(src0_h, src4_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + ST8x1_UB(dst0, dst); +} + +static void filter_horiz_w16_msa(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; + v16u8 src4 = { 0 }, src5 = { 0 }, src6 = { 0 }, src7 = { 0 }; + v16u8 tmp0, tmp1, tmp2, tmp3, dst0; + v16i8 out0, out1, out2, out3, out4, out5, out6, out7; + v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; + v16i8 shf2 = shf1 + 4; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + v8i16 dst0_h, dst1_h, dst2_h, dst3_h; + + LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src0); + INSERT_D2_UB(srcd2, srcd3, src1); + LD4(src_x + 4 * src_pitch, src_pitch, srcd0, 
srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src2); + INSERT_D2_UB(srcd2, srcd3, src3); + LD4(src_x + 8 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src4); + INSERT_D2_UB(srcd2, srcd3, src5); + LD4(src_x + 12 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src6); + INSERT_D2_UB(srcd2, srcd3, src7); + + filt = LD_SH(x_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + // transpose + VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3); + ILVRL_W2_SB(tmp2, tmp0, out0, out1); + ILVRL_W2_SB(tmp3, tmp1, out2, out3); + XORI_B4_128_SB(out0, out1, out2, out3); + + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + UNPCK_SB_SH(out2, src4_h, src5_h); + UNPCK_SB_SH(out3, src6_h, src7_h); + + VSHF_B2_UB(src4, src5, src4, src5, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(src6, src7, src6, src7, shf1, shf2, tmp2, tmp3); + ILVRL_W2_SB(tmp2, tmp0, out4, out5); + ILVRL_W2_SB(tmp3, tmp1, out6, out7); + XORI_B4_128_SB(out4, out5, out6, out7); + + dst0_h = src0_h * filt0; + dst1_h = src4_h * filt4; + dst0_h += src1_h * filt1; + dst1_h += src5_h * filt5; + dst0_h += src2_h * filt2; + dst1_h += src6_h * filt6; + dst0_h += src3_h * filt3; + dst1_h += src7_h * filt7; + + UNPCK_SB_SH(out4, src0_h, src1_h); + UNPCK_SB_SH(out5, src2_h, src3_h); + UNPCK_SB_SH(out6, src4_h, src5_h); + UNPCK_SB_SH(out7, src6_h, src7_h); + + dst2_h = src0_h * filt0; + dst3_h = src4_h * filt4; + dst2_h += src1_h * filt1; + dst3_h += src5_h * filt5; + dst2_h += src2_h * filt2; + dst3_h += src6_h * filt6; + dst2_h += src3_h * filt3; + dst3_h += src7_h * filt7; + + ADDS_SH2_SH(dst0_h, dst1_h, dst2_h, dst3_h, dst0_h, dst2_h); + SRARI_H2_SH(dst0_h, dst2_h, FILTER_BITS); + SAT_SH2_SH(dst0_h, dst2_h, 7); + dst0 = PCKEV_XORI128_UB(dst0_h, dst2_h); + ST_UB(dst0, dst); +} + +static void transpose4x4_to_dst(const uint8_t *src, uint8_t *dst, + ptrdiff_t dst_stride) { + v16u8 in0; + v16i8 out0 = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + + in0 = LD_UB(src); + out0 = __msa_vshf_b(out0, (v16i8)in0, (v16i8)in0); + ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride); +} + +static void transpose8x8_to_dst(const uint8_t *src, uint8_t *dst, + ptrdiff_t dst_stride) { + v16u8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3; + v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; + v16i8 shf2 = shf1 + 4; + + LD_UB4(src, 16, in0, in1, in2, in3); + VSHF_B2_UB(in0, in1, in0, in1, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(in2, in3, in2, in3, shf1, shf2, tmp2, tmp3); + ILVRL_W2_UB(tmp2, tmp0, out0, out1); + ILVRL_W2_UB(tmp3, tmp1, out2, out3); + ST8x4_UB(out0, out1, dst, dst_stride); + ST8x4_UB(out2, out3, dst + 4 * dst_stride, dst_stride); +} + +static void transpose16x16_to_dst(const uint8_t *src, uint8_t *dst, + ptrdiff_t dst_stride) { + v16u8 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12; + v16u8 in13, in14, in15, out0, out1, out2, out3, out4, out5, out6, out7, out8; + v16u8 out9, out10, out11, out12, out13, out14, out15; + + LD_UB8(src, 16, in0, in1, in2, in3, in4, in5, in6, in7); + LD_UB8(src + 16 * 8, 16, in8, in9, in10, in11, in12, in13, in14, in15); + + TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, out0, out1, out2, out3, + out4, out5, out6, out7); + ST_UB8(out0, out1, out2, out3, 
out4, out5, out6, out7, dst, dst_stride); + dst += 8 * dst_stride; + + SLDI_B4_0_UB(in0, in1, in2, in3, in0, in1, in2, in3, 8); + SLDI_B4_0_UB(in4, in5, in6, in7, in4, in5, in6, in7, 8); + SLDI_B4_0_UB(in8, in9, in10, in11, in8, in9, in10, in11, 8); + SLDI_B4_0_UB(in12, in13, in14, in15, in12, in13, in14, in15, 8); + + TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, out8, out9, out10, out11, + out12, out13, out14, out15); + ST_UB8(out8, out9, out10, out11, out12, out13, out14, out15, dst, dst_stride); +} + +static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int y, z, i; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; y += 4) { + int x_q4 = x0_q4; + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w4_msa(src_x, src_stride, temp + (z * 4), x_filter); + } else { + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + + x_q4 += x_step_q4; + } + + transpose4x4_to_dst(temp, dst, dst_stride); + + src += src_stride * 4; + dst += dst_stride * 4; + } +} + +static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int y, z, i; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. + y = h + (8 - (h & 0x7)); + + do { + int x_q4 = x0_q4; + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w8_msa(src_x, src_stride, temp + (z * 8), x_filter); + } else { + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[3 + i * src_stride]; + } + } + + x_q4 += x_step_q4; + } + + transpose8x8_to_dst(temp, dst, dst_stride); + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static void scaledconvolve_horiz_mul16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[16 * 16]); + int x, y, z, i; + + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 16x16 areas. The intermediate height is not always + // a multiple of 16, so force it to be a multiple of 8 here. 
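+ // For example, h == 20 gives y == 20 + (16 - 4) == 32 and h == 32 gives y == 48, so the loop below always covers whole 16-row blocks.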
+ y = h + (16 - (h & 0xF)); + + do { + int x_q4 = x0_q4; + for (x = 0; x < w; x += 16) { + for (z = 0; z < 16; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w16_msa(src_x, src_stride, temp + (z * 16), x_filter); + } else { + for (i = 0; i < 16; ++i) { + temp[z * 16 + i] = src_x[3 + i * src_stride]; + } + } + + x_q4 += x_step_q4; + } + + transpose16x16_to_dst(temp, dst + x, dst_stride); + } + + src += src_stride * 16; + dst += dst_stride * 16; + } while (y -= 16); +} + +static void filter_vert_w4_msa(const uint8_t *src_y, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *y_filter) { + uint32_t srcw0, srcw1, srcw2, srcw3, srcw4, srcw5, srcw6, srcw7; + uint32_t res; + v16u8 src0 = { 0 }, src1 = { 0 }, dst0; + v16i8 out0, out1; + v16i8 shf1 = { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 }; + v16i8 shf2 = shf1 + 8; + v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 }; + v16i8 filt_shf1 = filt_shf0 + 2; + v16i8 filt_shf2 = filt_shf0 + 4; + v16i8 filt_shf3 = filt_shf0 + 6; + v8i16 filt, src0_h, src1_h, src2_h, src3_h; + v8i16 filt0, filt1, filt2, filt3; + + LW4(src_y, src_pitch, srcw0, srcw1, srcw2, srcw3); + LW4(src_y + 4 * src_pitch, src_pitch, srcw4, srcw5, srcw6, srcw7); + INSERT_W4_UB(srcw0, srcw1, srcw2, srcw3, src0); + INSERT_W4_UB(srcw4, srcw5, srcw6, srcw7, src1); + VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1); + XORI_B2_128_SB(out0, out1); + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + + filt = LD_SH(y_filter); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3); + + src0_h *= filt0; + src0_h += src1_h * filt1; + src0_h += src2_h * filt2; + src0_h += src3_h * filt3; + + src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8); + + src0_h = __msa_adds_s_h(src0_h, src1_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + res = __msa_copy_u_w((v4i32)dst0, 0); + SW(res, dst); +} + +static void filter_vert_w8_msa(const uint8_t *src_y, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *y_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + v16u8 dst0; + v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + + LD4(src_y, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_SB(srcd0, srcd1, src0); + INSERT_D2_SB(srcd2, srcd3, src1); + LD4(src_y + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_SB(srcd0, srcd1, src2); + INSERT_D2_SB(srcd2, srcd3, src3); + + filt = LD_SH(y_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + XORI_B4_128_SB(src0, src1, src2, src3); + UNPCK_SB_SH(src0, src0_h, src1_h); + UNPCK_SB_SH(src1, src2_h, src3_h); + UNPCK_SB_SH(src2, src4_h, src5_h); + UNPCK_SB_SH(src3, src6_h, src7_h); + + src0_h *= filt0; + src4_h *= filt4; + src0_h += src1_h * filt1; + src4_h += src5_h * filt5; + src0_h += src2_h * filt2; + src4_h += src6_h * filt6; + src0_h += src3_h * filt3; + src4_h += src7_h * filt7; + + src0_h = __msa_adds_s_h(src0_h, src4_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, 
src0_h); + ST8x1_UB(dst0, dst); +} + +static void filter_vert_mul_w16_msa(const uint8_t *src_y, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *y_filter, + int w) { + int x; + v16u8 dst0; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 src8_h, src9_h, src10_h, src11_h, src12_h, src13_h, src14_h, src15_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + + filt = LD_SH(y_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + for (x = 0; x < w; x += 16) { + LD_SB8(src_y, src_pitch, src0, src1, src2, src3, src4, src5, src6, src7); + src_y += 16; + + XORI_B4_128_SB(src0, src1, src2, src3); + XORI_B4_128_SB(src4, src5, src6, src7); + UNPCK_SB_SH(src0, src0_h, src1_h); + UNPCK_SB_SH(src1, src2_h, src3_h); + UNPCK_SB_SH(src2, src4_h, src5_h); + UNPCK_SB_SH(src3, src6_h, src7_h); + UNPCK_SB_SH(src4, src8_h, src9_h); + UNPCK_SB_SH(src5, src10_h, src11_h); + UNPCK_SB_SH(src6, src12_h, src13_h); + UNPCK_SB_SH(src7, src14_h, src15_h); + + src0_h *= filt0; + src1_h *= filt0; + src8_h *= filt4; + src9_h *= filt4; + src0_h += src2_h * filt1; + src1_h += src3_h * filt1; + src8_h += src10_h * filt5; + src9_h += src11_h * filt5; + src0_h += src4_h * filt2; + src1_h += src5_h * filt2; + src8_h += src12_h * filt6; + src9_h += src13_h * filt6; + src0_h += src6_h * filt3; + src1_h += src7_h * filt3; + src8_h += src14_h * filt7; + src9_h += src15_h * filt7; + + ADDS_SH2_SH(src0_h, src8_h, src1_h, src9_h, src0_h, src1_h); + SRARI_H2_SH(src0_h, src1_h, FILTER_BITS); + SAT_SH2_SH(src0_h, src1_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src1_h); + ST_UB(dst0, dst); + dst += 16; + } +} + +static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_w4_msa(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + uint32_t srcd = LW(src_y + 3 * src_stride); + SW(srcd, dst + y * dst_stride); + } + + y_q4 += y_step_q4; + } +} + +static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_w8_msa(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + uint64_t srcd = LD(src_y + 3 * src_stride); + SD(srcd, dst + y * dst_stride); + } + + y_q4 += y_step_q4; + } +} + +static void scaledconvolve_vert_mul16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + int y_q4 = y0_q4; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + 
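// The sub-pixel phase is non-zero, so this row needs the full 8-tap vertical filter. +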
filter_vert_mul_w16_msa(src_y, src_stride, &dst[y * dst_stride], y_filter, + w); + } else { + for (x = 0; x < w; ++x) { + dst[x + y * dst_stride] = src_y[x + 3 * src_stride]; + } + } + + y_q4 += y_step_q4; + } +} + +void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if ((0 == x0_q4) && (16 == x_step_q4) && (0 == y0_q4) && (16 == y_step_q4)) { + vpx_convolve_copy_msa(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + } else { + if (w >= 16) { + scaledconvolve_horiz_mul16(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, + w, intermediate_height); + } else if (w == 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, + intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, + intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_mul16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, h); + } else { + scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, h); + } + } +} diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c index 410682271f..13fce0077c 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c @@ -628,9 +628,10 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_ver[8]; assert(y_step_q4 == 16); @@ -663,8 +664,8 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_ver[3], h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, 
filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -690,8 +691,8 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c index 45399bad85..ce649935da 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, @@ -188,13 +189,14 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; switch (w) { case 4: { diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c index c3d87a4ab8..c2ab33a2f4 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c @@ -9,6 +9,7 @@ */ #include +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" static void copy_width8_msa(const uint8_t *src, int32_t src_stride, @@ -198,13 +199,14 @@ static void copy_width64_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; switch (w) { case 4: { diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h b/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h index 198c21ed20..d53244596b 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h @@ -16,18 +16,18 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; -#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ - filt3) \ - ({ \ - v8i16 tmp0, tmp1; \ - \ - tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ - tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \ - tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ - tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3); \ - tmp0 = __msa_adds_s_h(tmp0, tmp1); \ - \ - tmp0; \ +#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ + filt3) \ + ({ \ + v8i16 tmp_dpadd_0, tmp_dpadd_1; \ + \ + tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + 
tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \ + tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ + tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \ + tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \ + \ + tmp_dpadd_0; \ }) #define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \ @@ -110,15 +110,13 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; ST_UB(tmp_m, (pdst)); \ } -#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \ - stride) \ - { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - \ - PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ - PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ +#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m); \ + AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ } #endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */ diff --git a/libs/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h b/libs/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h new file mode 100644 index 0000000000..2c5d9a4f6a --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ +#define VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/ppc/types_vsx.h" + +// Load 8 16 bit values. If the source is 32 bits then pack down with +// saturation. +static INLINE int16x8_t load_tran_low(int32_t c, const tran_low_t *s) { +#if CONFIG_VP9_HIGHBITDEPTH + int32x4_t u = vec_vsx_ld(c, s); + int32x4_t v = vec_vsx_ld(c, s + 4); + return vec_packs(u, v); +#else + return vec_vsx_ld(c, s); +#endif +} + +// Store 8 16 bit values. If the destination is 32 bits then sign extend the +// values by multiplying by 1. +static INLINE void store_tran_low(int16x8_t v, int32_t c, tran_low_t *s) { +#if CONFIG_VP9_HIGHBITDEPTH + const int16x8_t one = vec_splat_s16(1); + const int32x4_t even = vec_mule(v, one); + const int32x4_t odd = vec_mulo(v, one); + const int32x4_t high = vec_mergeh(even, odd); + const int32x4_t low = vec_mergel(even, odd); + vec_vsx_st(high, c, s); + vec_vsx_st(low, c, s + 4); +#else + vec_vsx_st(v, c, s); +#endif +} + +#endif // VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ diff --git a/libs/libvpx/vpx_dsp/ppc/hadamard_vsx.c b/libs/libvpx/vpx_dsp/ppc/hadamard_vsx.c new file mode 100644 index 0000000000..e279b30478 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/hadamard_vsx.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" +#include "vpx_dsp/ppc/transpose_vsx.h" +#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" + +static void vpx_hadamard_s16_8x8_one_pass(int16x8_t v[8]) { + const int16x8_t b0 = vec_add(v[0], v[1]); + const int16x8_t b1 = vec_sub(v[0], v[1]); + const int16x8_t b2 = vec_add(v[2], v[3]); + const int16x8_t b3 = vec_sub(v[2], v[3]); + const int16x8_t b4 = vec_add(v[4], v[5]); + const int16x8_t b5 = vec_sub(v[4], v[5]); + const int16x8_t b6 = vec_add(v[6], v[7]); + const int16x8_t b7 = vec_sub(v[6], v[7]); + + const int16x8_t c0 = vec_add(b0, b2); + const int16x8_t c1 = vec_add(b1, b3); + const int16x8_t c2 = vec_sub(b0, b2); + const int16x8_t c3 = vec_sub(b1, b3); + const int16x8_t c4 = vec_add(b4, b6); + const int16x8_t c5 = vec_add(b5, b7); + const int16x8_t c6 = vec_sub(b4, b6); + const int16x8_t c7 = vec_sub(b5, b7); + + v[0] = vec_add(c0, c4); + v[1] = vec_sub(c2, c6); + v[2] = vec_sub(c0, c4); + v[3] = vec_add(c2, c6); + v[4] = vec_add(c3, c7); + v[5] = vec_sub(c3, c7); + v[6] = vec_sub(c1, c5); + v[7] = vec_add(c1, c5); +} + +void vpx_hadamard_8x8_vsx(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int16x8_t v[8]; + + v[0] = vec_vsx_ld(0, src_diff); + v[1] = vec_vsx_ld(0, src_diff + src_stride); + v[2] = vec_vsx_ld(0, src_diff + (2 * src_stride)); + v[3] = vec_vsx_ld(0, src_diff + (3 * src_stride)); + v[4] = vec_vsx_ld(0, src_diff + (4 * src_stride)); + v[5] = vec_vsx_ld(0, src_diff + (5 * src_stride)); + v[6] = vec_vsx_ld(0, src_diff + (6 * src_stride)); + v[7] = vec_vsx_ld(0, src_diff + (7 * src_stride)); + + vpx_hadamard_s16_8x8_one_pass(v); + + vpx_transpose_s16_8x8(v); + + vpx_hadamard_s16_8x8_one_pass(v); + + store_tran_low(v[0], 0, coeff); + store_tran_low(v[1], 0, coeff + 8); + store_tran_low(v[2], 0, coeff + 16); + store_tran_low(v[3], 0, coeff + 24); + store_tran_low(v[4], 0, coeff + 32); + store_tran_low(v[5], 0, coeff + 40); + store_tran_low(v[6], 0, coeff + 48); + store_tran_low(v[7], 0, coeff + 56); +} + +void vpx_hadamard_16x16_vsx(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int i; + const uint16x8_t ones = vec_splat_u16(1); + + /* Rearrange 16x16 to 8x32 and remove stride. + * Top left first. */ + vpx_hadamard_8x8_vsx(src_diff, src_stride, coeff); + /* Top right. */ + vpx_hadamard_8x8_vsx(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64); + /* Bottom left. */ + vpx_hadamard_8x8_vsx(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128); + /* Bottom right. */ + vpx_hadamard_8x8_vsx(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); + + /* Overlay the 8x8 blocks and combine. */ + for (i = 0; i < 64; i += 8) { + const int16x8_t a0 = load_tran_low(0, coeff); + const int16x8_t a1 = load_tran_low(0, coeff + 64); + const int16x8_t a2 = load_tran_low(0, coeff + 128); + const int16x8_t a3 = load_tran_low(0, coeff + 192); + + /* Prevent the result from escaping int16_t. 
*/ + const int16x8_t b0 = vec_sra(a0, ones); + const int16x8_t b1 = vec_sra(a1, ones); + const int16x8_t b2 = vec_sra(a2, ones); + const int16x8_t b3 = vec_sra(a3, ones); + + const int16x8_t c0 = vec_add(b0, b1); + const int16x8_t c2 = vec_add(b2, b3); + const int16x8_t c1 = vec_sub(b0, b1); + const int16x8_t c3 = vec_sub(b2, b3); + + const int16x8_t d0 = vec_add(c0, c2); + const int16x8_t d1 = vec_add(c1, c3); + const int16x8_t d2 = vec_sub(c0, c2); + const int16x8_t d3 = vec_sub(c1, c3); + + store_tran_low(d0, 0, coeff); + store_tran_low(d1, 0, coeff + 64); + store_tran_low(d2, 0, coeff + 128); + store_tran_low(d3, 0, coeff + 192); + + coeff += 8; + } +} diff --git a/libs/libvpx/vpx_dsp/ppc/intrapred_vsx.c b/libs/libvpx/vpx_dsp/ppc/intrapred_vsx.c new file mode 100644 index 0000000000..6273460f19 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/intrapred_vsx.c @@ -0,0 +1,749 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, above); + int i; + (void)left; + + for (i = 0; i < 16; i++, dst += stride) { + vec_vsx_st(d, 0, dst); + } +} + +void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vec_vsx_ld(0, above); + const uint8x16_t d1 = vec_vsx_ld(16, above); + int i; + (void)left; + + for (i = 0; i < 32; i++, dst += stride) { + vec_vsx_st(d0, 0, dst); + vec_vsx_st(d1, 16, dst); + } +} + +static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; + +void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, left); + const uint8x16_t v0 = vec_splat(d, 0); + const uint8x16_t v1 = vec_splat(d, 1); + const uint8x16_t v2 = vec_splat(d, 2); + const uint8x16_t v3 = vec_splat(d, 3); + + (void)above; + + vec_vsx_st(vec_sel(v0, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); + dst += stride; + vec_vsx_st(vec_sel(v1, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); + dst += stride; + vec_vsx_st(vec_sel(v2, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); + dst += stride; + vec_vsx_st(vec_sel(v3, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); +} + +void vpx_h_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, left); + const uint8x16_t v0 = vec_splat(d, 0); + const uint8x16_t v1 = vec_splat(d, 1); + const uint8x16_t v2 = vec_splat(d, 2); + const uint8x16_t v3 = vec_splat(d, 3); + + const uint8x16_t v4 = vec_splat(d, 4); + const uint8x16_t v5 = vec_splat(d, 5); + const uint8x16_t v6 = vec_splat(d, 6); + const uint8x16_t v7 = vec_splat(d, 7); + + (void)above; + + vec_vsx_st(xxpermdi(v0, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v1, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v2, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v3, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + 
vec_vsx_st(xxpermdi(v4, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v5, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v6, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v7, vec_vsx_ld(0, dst), 1), 0, dst); +} + +void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, left); + const uint8x16_t v0 = vec_splat(d, 0); + const uint8x16_t v1 = vec_splat(d, 1); + const uint8x16_t v2 = vec_splat(d, 2); + const uint8x16_t v3 = vec_splat(d, 3); + + const uint8x16_t v4 = vec_splat(d, 4); + const uint8x16_t v5 = vec_splat(d, 5); + const uint8x16_t v6 = vec_splat(d, 6); + const uint8x16_t v7 = vec_splat(d, 7); + + const uint8x16_t v8 = vec_splat(d, 8); + const uint8x16_t v9 = vec_splat(d, 9); + const uint8x16_t v10 = vec_splat(d, 10); + const uint8x16_t v11 = vec_splat(d, 11); + + const uint8x16_t v12 = vec_splat(d, 12); + const uint8x16_t v13 = vec_splat(d, 13); + const uint8x16_t v14 = vec_splat(d, 14); + const uint8x16_t v15 = vec_splat(d, 15); + + (void)above; + + vec_vsx_st(v0, 0, dst); + dst += stride; + vec_vsx_st(v1, 0, dst); + dst += stride; + vec_vsx_st(v2, 0, dst); + dst += stride; + vec_vsx_st(v3, 0, dst); + dst += stride; + vec_vsx_st(v4, 0, dst); + dst += stride; + vec_vsx_st(v5, 0, dst); + dst += stride; + vec_vsx_st(v6, 0, dst); + dst += stride; + vec_vsx_st(v7, 0, dst); + dst += stride; + vec_vsx_st(v8, 0, dst); + dst += stride; + vec_vsx_st(v9, 0, dst); + dst += stride; + vec_vsx_st(v10, 0, dst); + dst += stride; + vec_vsx_st(v11, 0, dst); + dst += stride; + vec_vsx_st(v12, 0, dst); + dst += stride; + vec_vsx_st(v13, 0, dst); + dst += stride; + vec_vsx_st(v14, 0, dst); + dst += stride; + vec_vsx_st(v15, 0, dst); +} + +#define H_PREDICTOR_32(v) \ + vec_vsx_st(v, 0, dst); \ + vec_vsx_st(v, 16, dst); \ + dst += stride + +void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vec_vsx_ld(0, left); + const uint8x16_t d1 = vec_vsx_ld(16, left); + + const uint8x16_t v0_0 = vec_splat(d0, 0); + const uint8x16_t v1_0 = vec_splat(d0, 1); + const uint8x16_t v2_0 = vec_splat(d0, 2); + const uint8x16_t v3_0 = vec_splat(d0, 3); + const uint8x16_t v4_0 = vec_splat(d0, 4); + const uint8x16_t v5_0 = vec_splat(d0, 5); + const uint8x16_t v6_0 = vec_splat(d0, 6); + const uint8x16_t v7_0 = vec_splat(d0, 7); + const uint8x16_t v8_0 = vec_splat(d0, 8); + const uint8x16_t v9_0 = vec_splat(d0, 9); + const uint8x16_t v10_0 = vec_splat(d0, 10); + const uint8x16_t v11_0 = vec_splat(d0, 11); + const uint8x16_t v12_0 = vec_splat(d0, 12); + const uint8x16_t v13_0 = vec_splat(d0, 13); + const uint8x16_t v14_0 = vec_splat(d0, 14); + const uint8x16_t v15_0 = vec_splat(d0, 15); + + const uint8x16_t v0_1 = vec_splat(d1, 0); + const uint8x16_t v1_1 = vec_splat(d1, 1); + const uint8x16_t v2_1 = vec_splat(d1, 2); + const uint8x16_t v3_1 = vec_splat(d1, 3); + const uint8x16_t v4_1 = vec_splat(d1, 4); + const uint8x16_t v5_1 = vec_splat(d1, 5); + const uint8x16_t v6_1 = vec_splat(d1, 6); + const uint8x16_t v7_1 = vec_splat(d1, 7); + const uint8x16_t v8_1 = vec_splat(d1, 8); + const uint8x16_t v9_1 = vec_splat(d1, 9); + const uint8x16_t v10_1 = vec_splat(d1, 10); + const uint8x16_t v11_1 = vec_splat(d1, 11); + const uint8x16_t v12_1 = vec_splat(d1, 12); + const uint8x16_t v13_1 = vec_splat(d1, 13); + const uint8x16_t v14_1 = vec_splat(d1, 14); + const uint8x16_t v15_1 = vec_splat(d1, 
15); + + (void)above; + + H_PREDICTOR_32(v0_0); + H_PREDICTOR_32(v1_0); + H_PREDICTOR_32(v2_0); + H_PREDICTOR_32(v3_0); + + H_PREDICTOR_32(v4_0); + H_PREDICTOR_32(v5_0); + H_PREDICTOR_32(v6_0); + H_PREDICTOR_32(v7_0); + + H_PREDICTOR_32(v8_0); + H_PREDICTOR_32(v9_0); + H_PREDICTOR_32(v10_0); + H_PREDICTOR_32(v11_0); + + H_PREDICTOR_32(v12_0); + H_PREDICTOR_32(v13_0); + H_PREDICTOR_32(v14_0); + H_PREDICTOR_32(v15_0); + + H_PREDICTOR_32(v0_1); + H_PREDICTOR_32(v1_1); + H_PREDICTOR_32(v2_1); + H_PREDICTOR_32(v3_1); + + H_PREDICTOR_32(v4_1); + H_PREDICTOR_32(v5_1); + H_PREDICTOR_32(v6_1); + H_PREDICTOR_32(v7_1); + + H_PREDICTOR_32(v8_1); + H_PREDICTOR_32(v9_1); + H_PREDICTOR_32(v10_1); + H_PREDICTOR_32(v11_1); + + H_PREDICTOR_32(v12_1); + H_PREDICTOR_32(v13_1); + H_PREDICTOR_32(v14_1); + H_PREDICTOR_32(v15_1); +} + +void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left)); + const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above)); + int16x8_t tmp, val; + uint8x16_t d; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 0), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); + dst += stride; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 1), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); + dst += stride; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 2), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); + dst += stride; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 3), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); +} + +void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left)); + const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above)); + int16x8_t tmp, val; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 0), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 1), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 2), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 3), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 4), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 5), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 6), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 7), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); +} + +static void 
tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l, + int16x8_t ah, int16x8_t al, int16x8_t tl) { + int16x8_t vh, vl, ls; + + ls = vec_splat(l, 0); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 1); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 2); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 3); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 4); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 5); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 6); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 7); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); +} + +void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const uint8x16_t l = vec_vsx_ld(0, left); + const int16x8_t lh = unpack_to_s16_h(l); + const int16x8_t ll = unpack_to_s16_l(l); + const uint8x16_t a = vec_vsx_ld(0, above); + const int16x8_t ah = unpack_to_s16_h(a); + const int16x8_t al = unpack_to_s16_l(a); + + tm_predictor_16x8(dst, stride, lh, ah, al, tl); + + dst += stride * 8; + + tm_predictor_16x8(dst, stride, ll, ah, al, tl); +} + +static INLINE void tm_predictor_32x1(uint8_t *dst, const int16x8_t ls, + const int16x8_t a0h, const int16x8_t a0l, + const int16x8_t a1h, const int16x8_t a1l, + const int16x8_t tl) { + int16x8_t vh, vl; + + vh = vec_sub(vec_add(ls, a0h), tl); + vl = vec_sub(vec_add(ls, a0l), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + vh = vec_sub(vec_add(ls, a1h), tl); + vl = vec_sub(vec_add(ls, a1l), tl); + vec_vsx_st(vec_packsu(vh, vl), 16, dst); +} + +static void tm_predictor_32x8(uint8_t *dst, const ptrdiff_t stride, + const int16x8_t l, const uint8x16_t a0, + const uint8x16_t a1, const int16x8_t tl) { + const int16x8_t a0h = unpack_to_s16_h(a0); + const int16x8_t a0l = unpack_to_s16_l(a0); + const int16x8_t a1h = unpack_to_s16_h(a1); + const int16x8_t a1l = unpack_to_s16_l(a1); + + tm_predictor_32x1(dst, vec_splat(l, 0), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 1), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 2), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 3), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 4), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 5), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 6), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 7), a0h, a0l, a1h, a1l, tl); +} + +void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + 
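// TM prediction: each output pixel is left + above - top_left, saturated to 8 bits by vec_packsu. +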
const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const uint8x16_t l1 = vec_vsx_ld(16, left); + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + + tm_predictor_32x8(dst, stride, unpack_to_s16_h(l0), a0, a1, tl); + dst += stride * 8; + + tm_predictor_32x8(dst, stride, unpack_to_s16_l(l0), a0, a1, tl); + dst += stride * 8; + + tm_predictor_32x8(dst, stride, unpack_to_s16_h(l1), a0, a1, tl); + dst += stride * 8; + + tm_predictor_32x8(dst, stride, unpack_to_s16_l(l1), a0, a1, tl); +} + +static INLINE void dc_fill_predictor_8x8(uint8_t *dst, const ptrdiff_t stride, + const uint8x16_t val) { + int i; + + for (i = 0; i < 8; i++, dst += stride) { + const uint8x16_t d = vec_vsx_ld(0, dst); + vec_vsx_st(xxpermdi(val, d, 1), 0, dst); + } +} + +static INLINE void dc_fill_predictor_16x16(uint8_t *dst, const ptrdiff_t stride, + const uint8x16_t val) { + int i; + + for (i = 0; i < 16; i++, dst += stride) { + vec_vsx_st(val, 0, dst); + } +} + +void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7)); + (void)above; + (void)left; + + dc_fill_predictor_16x16(dst, stride, v128); +} + +static INLINE void dc_fill_predictor_32x32(uint8_t *dst, const ptrdiff_t stride, + const uint8x16_t val) { + int i; + + for (i = 0; i < 32; i++, dst += stride) { + vec_vsx_st(val, 0, dst); + vec_vsx_st(val, 16, dst); + } +} + +void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7)); + (void)above; + (void)left; + + dc_fill_predictor_32x32(dst, stride, v128); +} + +static uint8x16_t avg16(const uint8_t *values) { + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(vec_vsx_ld(0, values), vec_splat_u32(0)); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, vec_splat_s32(8)); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + dc_fill_predictor_16x16(dst, stride, avg16(left)); +} + +void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + dc_fill_predictor_16x16(dst, stride, avg16(above)); +} + +static uint8x16_t avg32(const uint8_t *values) { + const uint8x16_t v0 = vec_vsx_ld(0, values); + const uint8x16_t v1 = vec_vsx_ld(16, values); + const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4)); + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(v0, vec_sum4s(v1, vec_splat_u32(0))); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + dc_fill_predictor_32x32(dst, stride, avg32(left)); +} + +void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + dc_fill_predictor_32x32(dst, stride, avg32(above)); +} + +static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) { + 
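// DC value for the 8x8 predictor: (sum of the 8 above and 8 left pixels + 8) >> 4, splatted across the vector. +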
const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0))); + const int32x4_t sum4s8 = xxpermdi(sum4s, vec_splat_s32(0), 1); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s8, vec_splat_s32(8)); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4)); + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0))); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left)); +} + +void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_fill_predictor_16x16(dst, stride, dc_avg16(above, left)); +} + +static uint8x16_t dc_avg32(const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const uint8x16_t l1 = vec_vsx_ld(16, left); + const int32x4_t v32 = vec_sl(vec_splat_s32(1), vec_splat_u32(5)); + const uint32x4_t a_sum = vec_sum4s(a0, vec_sum4s(a1, vec_splat_u32(0))); + const int32x4_t sum4s = (int32x4_t)vec_sum4s(l0, vec_sum4s(l1, a_sum)); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v32); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(6)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_fill_predictor_32x32(dst, stride, dc_avg32(above, left)); +} + +static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b, + const uint8x16_t c) { + const uint8x16_t ac = + vec_adds(vec_and(a, c), vec_sr(vec_xor(a, c), vec_splat_u8(1))); + + return vec_avg(ac, b); +} + +// Workaround vec_sld/vec_xxsldi/vec_lsdoi being missing or broken. 
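+// With vec_perm, sl1 picks bytes 1..15 of the first operand followed by byte 0 of the second, i.e. the concatenated pair shifted left by one byte.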
+static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, + 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 }; + +void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t af = vec_vsx_ld(0, above); + const uint8x16_t above_right = vec_splat(af, 7); + const uint8x16_t a = xxpermdi(af, above_right, 1); + const uint8x16_t b = vec_perm(a, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row = avg3(a, b, c); + int i; + (void)left; + + for (i = 0; i < 8; i++) { + const uint8x16_t d = vec_vsx_ld(0, dst); + vec_vsx_st(xxpermdi(row, d, 1), 0, dst); + dst += stride; + row = vec_perm(row, above_right, sl1); + } +} + +void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a = vec_vsx_ld(0, above); + const uint8x16_t above_right = vec_splat(a, 15); + const uint8x16_t b = vec_perm(a, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row = avg3(a, b, c); + int i; + (void)left; + + for (i = 0; i < 16; i++) { + vec_vsx_st(row, 0, dst); + dst += stride; + row = vec_perm(row, above_right, sl1); + } +} + +void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t above_right = vec_splat(a1, 15); + const uint8x16_t b0 = vec_perm(a0, a1, sl1); + const uint8x16_t b1 = vec_perm(a1, above_right, sl1); + const uint8x16_t c0 = vec_perm(b0, b1, sl1); + const uint8x16_t c1 = vec_perm(b1, above_right, sl1); + uint8x16_t row0 = avg3(a0, b0, c0); + uint8x16_t row1 = avg3(a1, b1, c1); + int i; + (void)left; + + for (i = 0; i < 32; i++) { + vec_vsx_st(row0, 0, dst); + vec_vsx_st(row1, 16, dst); + dst += stride; + row0 = vec_perm(row0, row1, sl1); + row1 = vec_perm(row1, above_right, sl1); + } +} + +void vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t af = vec_vsx_ld(0, above); + const uint8x16_t above_right = vec_splat(af, 9); + const uint8x16_t a = xxpermdi(af, above_right, 1); + const uint8x16_t b = vec_perm(a, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row0 = vec_avg(a, b); + uint8x16_t row1 = avg3(a, b, c); + int i; + (void)left; + + for (i = 0; i < 4; i++) { + const uint8x16_t d0 = vec_vsx_ld(0, dst); + const uint8x16_t d1 = vec_vsx_ld(0, dst + stride); + vec_vsx_st(xxpermdi(row0, d0, 1), 0, dst); + vec_vsx_st(xxpermdi(row1, d1, 1), 0, dst + stride); + dst += stride * 2; + row0 = vec_perm(row0, above_right, sl1); + row1 = vec_perm(row1, above_right, sl1); + } +} + +void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t above_right = vec_splat(a1, 0); + const uint8x16_t b = vec_perm(a0, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row0 = vec_avg(a0, b); + uint8x16_t row1 = avg3(a0, b, c); + int i; + (void)left; + + for (i = 0; i < 8; i++) { + vec_vsx_st(row0, 0, dst); + vec_vsx_st(row1, 0, dst + stride); + dst += stride * 2; + row0 = vec_perm(row0, above_right, sl1); + row1 = vec_perm(row1, above_right, sl1); + } +} + +void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, 
const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t a2 = vec_vsx_ld(32, above); + const uint8x16_t above_right = vec_splat(a2, 0); + const uint8x16_t b0 = vec_perm(a0, a1, sl1); + const uint8x16_t b1 = vec_perm(a1, above_right, sl1); + const uint8x16_t c0 = vec_perm(b0, b1, sl1); + const uint8x16_t c1 = vec_perm(b1, above_right, sl1); + uint8x16_t row0_0 = vec_avg(a0, b0); + uint8x16_t row0_1 = vec_avg(a1, b1); + uint8x16_t row1_0 = avg3(a0, b0, c0); + uint8x16_t row1_1 = avg3(a1, b1, c1); + int i; + (void)left; + + for (i = 0; i < 16; i++) { + vec_vsx_st(row0_0, 0, dst); + vec_vsx_st(row0_1, 16, dst); + vec_vsx_st(row1_0, 0, dst + stride); + vec_vsx_st(row1_1, 16, dst + stride); + dst += stride * 2; + row0_0 = vec_perm(row0_0, row0_1, sl1); + row0_1 = vec_perm(row0_1, above_right, sl1); + row1_0 = vec_perm(row1_0, row1_1, sl1); + row1_1 = vec_perm(row1_1, above_right, sl1); + } +} diff --git a/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c b/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c new file mode 100644 index 0000000000..d43a9fd184 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c @@ -0,0 +1,1063 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" +#include "vpx_dsp/ppc/types_vsx.h" + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/inv_txfm.h" + +static int16x8_t cospi1_v = { 16364, 16364, 16364, 16364, + 16364, 16364, 16364, 16364 }; +static int16x8_t cospi2_v = { 16305, 16305, 16305, 16305, + 16305, 16305, 16305, 16305 }; +static int16x8_t cospi3_v = { 16207, 16207, 16207, 16207, + 16207, 16207, 16207, 16207 }; +static int16x8_t cospi4_v = { 16069, 16069, 16069, 16069, + 16069, 16069, 16069, 16069 }; +static int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069, + -16069, -16069, -16069, -16069 }; +static int16x8_t cospi5_v = { 15893, 15893, 15893, 15893, + 15893, 15893, 15893, 15893 }; +static int16x8_t cospi6_v = { 15679, 15679, 15679, 15679, + 15679, 15679, 15679, 15679 }; +static int16x8_t cospi7_v = { 15426, 15426, 15426, 15426, + 15426, 15426, 15426, 15426 }; +static int16x8_t cospi8_v = { 15137, 15137, 15137, 15137, + 15137, 15137, 15137, 15137 }; +static int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137, + -15137, -15137, -15137, -15137 }; +static int16x8_t cospi9_v = { 14811, 14811, 14811, 14811, + 14811, 14811, 14811, 14811 }; +static int16x8_t cospi10_v = { 14449, 14449, 14449, 14449, + 14449, 14449, 14449, 14449 }; +static int16x8_t cospi11_v = { 14053, 14053, 14053, 14053, + 14053, 14053, 14053, 14053 }; +static int16x8_t cospi12_v = { 13623, 13623, 13623, 13623, + 13623, 13623, 13623, 13623 }; +static int16x8_t cospi13_v = { 13160, 13160, 13160, 13160, + 13160, 13160, 13160, 13160 }; +static int16x8_t cospi14_v = { 12665, 12665, 12665, 12665, + 12665, 12665, 12665, 12665 }; +static int16x8_t cospi15_v = { 12140, 12140, 12140, 12140, + 12140, 12140, 12140, 12140 }; +static int16x8_t cospi16_v = { 11585, 11585, 11585, 11585, + 11585, 11585, 11585, 11585 }; +static int16x8_t cospi17_v = { 11003, 11003, 11003, 11003, + 11003, 11003, 11003, 11003 }; 
+static int16x8_t cospi18_v = { 10394, 10394, 10394, 10394, + 10394, 10394, 10394, 10394 }; +static int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, 9760, 9760, 9760, 9760 }; +static int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, 9102, 9102, 9102, 9102 }; +static int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102, + -9102, -9102, -9102, -9102 }; +static int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, 8423, 8423, 8423, 8423 }; +static int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, 7723, 7723, 7723, 7723 }; +static int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, 7005, 7005, 7005, 7005 }; +static int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, 6270, 6270, 6270, 6270 }; +static int16x8_t cospi24_mv = { -6270, -6270, -6270, -6270, + -6270, -6270, -6270, -6270 }; +static int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, 5520, 5520, 5520, 5520 }; +static int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, 4756, 4756, 4756, 4756 }; +static int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, 3981, 3981, 3981, 3981 }; +static int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, 3196, 3196, 3196, 3196 }; +static int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, 2404, 2404, 2404, 2404 }; +static int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 }; +static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; + +#define ROUND_SHIFT_INIT \ + const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \ + const uint32x4_t shift14 = vec_splat_u32(14); + +#define DCT_CONST_ROUND_SHIFT(vec) vec = vec_sra(vec_add(vec, shift), shift14); + +#define PIXEL_ADD_INIT \ + int16x8_t add8 = vec_splat_s16(8); \ + uint16x8_t shift4 = vec_splat_u16(4); + +#define PIXEL_ADD4(out, in) out = vec_sra(vec_add(in, add8), shift4); + +#define IDCT4(in0, in1, out0, out1) \ + t0 = vec_add(in0, in1); \ + t1 = vec_sub(in0, in1); \ + tmp16_0 = vec_mergeh(t0, t1); \ + temp1 = vec_sra(vec_add(vec_mule(tmp16_0, cospi16_v), shift), shift14); \ + temp2 = vec_sra(vec_add(vec_mulo(tmp16_0, cospi16_v), shift), shift14); \ + \ + tmp16_0 = vec_mergel(in0, in1); \ + temp3 = vec_sub(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp3); \ + temp4 = vec_add(vec_mule(tmp16_0, cospi8_v), vec_mulo(tmp16_0, cospi24_v)); \ + DCT_CONST_ROUND_SHIFT(temp4); \ + \ + step0 = vec_packs(temp1, temp2); \ + step1 = vec_packs(temp4, temp3); \ + out0 = vec_add(step0, step1); \ + out1 = vec_sub(step0, step1); \ + out1 = vec_perm(out1, out1, mask0); + +void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int32x4_t temp1, temp2, temp3, temp4; + int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1; + uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; + uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }; + int16x8_t v0 = load_tran_low(0, input); + int16x8_t v1 = load_tran_low(8 * sizeof(*input), input); + int16x8_t t0 = vec_mergeh(v0, v1); + int16x8_t t1 = vec_mergel(v0, v1); + + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + uint8x16_t zerov = vec_splat_u8(0); + int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov); + int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); + int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); + int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); + uint8x16_t output_v; + 
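+  // The cospi*_v tables above hold round(2^14 * cos(k * pi / 64)) (e.g.
+  // cospi16_v = 11585), replicated across all lanes so the constants can be
+  // fed straight to vec_mule()/vec_mulo(). Each butterfly keeps its products
+  // in 32 bits and DCT_CONST_ROUND_SHIFT() brings them back to transform
+  // precision with (x + 2^13) >> 14. After the two IDCT4 passes below,
+  // PIXEL_ADD4() applies the final output rounding (x + 8) >> 4, and the sums
+  // with the destination pixels are saturated back to 8 bits by vec_packsu().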
uint8_t tmp_dest[16]; + ROUND_SHIFT_INIT + PIXEL_ADD_INIT; + + v0 = vec_mergeh(t0, t1); + v1 = vec_mergel(t0, t1); + + IDCT4(v0, v1, t_out0, t_out1); + // transpose + t0 = vec_mergeh(t_out0, t_out1); + t1 = vec_mergel(t_out0, t_out1); + v0 = vec_mergeh(t0, t1); + v1 = vec_mergel(t0, t1); + IDCT4(v0, v1, t_out0, t_out1); + + PIXEL_ADD4(v0, t_out0); + PIXEL_ADD4(v1, t_out1); + tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0); + tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1); + output_v = vec_packsu(tmp16_0, tmp16_1); + + vec_vsx_st(output_v, 0, tmp_dest); + for (int i = 0; i < 4; i++) + for (int j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; +} + +#define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + out0 = vec_mergeh(in0, in1); \ + out1 = vec_mergel(in0, in1); \ + out2 = vec_mergeh(in2, in3); \ + out3 = vec_mergel(in2, in3); \ + out4 = vec_mergeh(in4, in5); \ + out5 = vec_mergel(in4, in5); \ + out6 = vec_mergeh(in6, in7); \ + out7 = vec_mergel(in6, in7); \ + in0 = (int16x8_t)vec_mergeh((int32x4_t)out0, (int32x4_t)out2); \ + in1 = (int16x8_t)vec_mergel((int32x4_t)out0, (int32x4_t)out2); \ + in2 = (int16x8_t)vec_mergeh((int32x4_t)out1, (int32x4_t)out3); \ + in3 = (int16x8_t)vec_mergel((int32x4_t)out1, (int32x4_t)out3); \ + in4 = (int16x8_t)vec_mergeh((int32x4_t)out4, (int32x4_t)out6); \ + in5 = (int16x8_t)vec_mergel((int32x4_t)out4, (int32x4_t)out6); \ + in6 = (int16x8_t)vec_mergeh((int32x4_t)out5, (int32x4_t)out7); \ + in7 = (int16x8_t)vec_mergel((int32x4_t)out5, (int32x4_t)out7); \ + out0 = vec_perm(in0, in4, tr8_mask0); \ + out1 = vec_perm(in0, in4, tr8_mask1); \ + out2 = vec_perm(in1, in5, tr8_mask0); \ + out3 = vec_perm(in1, in5, tr8_mask1); \ + out4 = vec_perm(in2, in6, tr8_mask0); \ + out5 = vec_perm(in2, in6, tr8_mask1); \ + out6 = vec_perm(in3, in7, tr8_mask0); \ + out7 = vec_perm(in3, in7, tr8_mask1); + +/* for the: temp1 = step[x] * cospi_q - step[y] * cospi_z + * temp2 = step[x] * cospi_z + step[y] * cospi_q */ +#define STEP8_0(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_sub(vec_mule(tmp16_0, cospi0), vec_mulo(tmp16_0, cospi1)); \ + temp11 = vec_sub(vec_mule(tmp16_1, cospi0), vec_mulo(tmp16_1, cospi1)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \ + temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +#define STEP8_1(inpt0, inpt1, outpt0, outpt1, cospi) \ + tmp16_2 = vec_sub(inpt0, inpt1); \ + tmp16_3 = vec_add(inpt0, inpt1); \ + tmp16_0 = vec_mergeh(tmp16_2, tmp16_3); \ + tmp16_1 = vec_mergel(tmp16_2, tmp16_3); \ + temp10 = vec_mule(tmp16_0, cospi); \ + temp11 = vec_mule(tmp16_1, cospi); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_mulo(tmp16_0, cospi); \ + temp11 = vec_mulo(tmp16_1, cospi); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7) \ + /* stage 1 */ \ + step0 = in0; \ + step2 = in4; \ + step1 = in2; \ + step3 = in6; \ + \ + STEP8_0(in1, in7, step4, step7, cospi28_v, cospi4_v); \ + STEP8_0(in5, in3, step5, step6, cospi12_v, 
cospi20_v); \ + \ + /* stage 2 */ \ + STEP8_1(step0, step2, in1, in0, cospi16_v); \ + STEP8_0(step1, step3, in2, in3, cospi24_v, cospi8_v); \ + in4 = vec_add(step4, step5); \ + in5 = vec_sub(step4, step5); \ + in6 = vec_sub(step7, step6); \ + in7 = vec_add(step6, step7); \ + \ + /* stage 3 */ \ + step0 = vec_add(in0, in3); \ + step1 = vec_add(in1, in2); \ + step2 = vec_sub(in1, in2); \ + step3 = vec_sub(in0, in3); \ + step4 = in4; \ + STEP8_1(in6, in5, step5, step6, cospi16_v); \ + step7 = in7; \ + \ + /* stage 4 */ \ + in0 = vec_add(step0, step7); \ + in1 = vec_add(step1, step6); \ + in2 = vec_add(step2, step5); \ + in3 = vec_add(step3, step4); \ + in4 = vec_sub(step3, step4); \ + in5 = vec_sub(step2, step5); \ + in6 = vec_sub(step1, step6); \ + in7 = vec_sub(step0, step7); + +#define PIXEL_ADD(in, out, add, shiftx) \ + out = vec_add(vec_sra(vec_add(in, add), shiftx), out); + +static uint8x16_t tr8_mask0 = { + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 +}; +static uint8x16_t tr8_mask1 = { + 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F +}; +void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int32x4_t temp10, temp11; + int16x8_t step0, step1, step2, step3, step4, step5, step6, step7; + int16x8_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp16_0, tmp16_1, + tmp16_2, tmp16_3; + int16x8_t src0 = load_tran_low(0, input); + int16x8_t src1 = load_tran_low(8 * sizeof(*input), input); + int16x8_t src2 = load_tran_low(16 * sizeof(*input), input); + int16x8_t src3 = load_tran_low(24 * sizeof(*input), input); + int16x8_t src4 = load_tran_low(32 * sizeof(*input), input); + int16x8_t src5 = load_tran_low(40 * sizeof(*input), input); + int16x8_t src6 = load_tran_low(48 * sizeof(*input), input); + int16x8_t src7 = load_tran_low(56 * sizeof(*input), input); + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + uint8x16_t dest4 = vec_vsx_ld(4 * stride, dest); + uint8x16_t dest5 = vec_vsx_ld(5 * stride, dest); + uint8x16_t dest6 = vec_vsx_ld(6 * stride, dest); + uint8x16_t dest7 = vec_vsx_ld(7 * stride, dest); + uint8x16_t zerov = vec_splat_u8(0); + int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov); + int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); + int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); + int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); + int16x8_t d_u4 = (int16x8_t)vec_mergeh(dest4, zerov); + int16x8_t d_u5 = (int16x8_t)vec_mergeh(dest5, zerov); + int16x8_t d_u6 = (int16x8_t)vec_mergeh(dest6, zerov); + int16x8_t d_u7 = (int16x8_t)vec_mergeh(dest7, zerov); + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(1)); + uint16x8_t shift5 = vec_splat_u16(5); + uint8x16_t output0, output1, output2, output3; + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(src0, src1, src2, src3, src4, src5, src6, src7, tmp0, tmp1, tmp2, + tmp3, tmp4, tmp5, tmp6, tmp7); + + IDCT8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + TRANSPOSE8x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, src2, + src3, src4, src5, src6, src7); + IDCT8(src0, src1, src2, src3, src4, src5, src6, src7); + PIXEL_ADD(src0, d_u0, add, shift5); + PIXEL_ADD(src1, d_u1, add, shift5); + PIXEL_ADD(src2, d_u2, add, shift5); + PIXEL_ADD(src3, d_u3, add, shift5); + PIXEL_ADD(src4, d_u4, add, shift5); + PIXEL_ADD(src5, d_u5, add, shift5); + PIXEL_ADD(src6, d_u6, 
add, shift5); + PIXEL_ADD(src7, d_u7, add, shift5); + output0 = vec_packsu(d_u0, d_u1); + output1 = vec_packsu(d_u2, d_u3); + output2 = vec_packsu(d_u4, d_u5); + output3 = vec_packsu(d_u6, d_u7); + + vec_vsx_st(xxpermdi(output0, dest0, 1), 0, dest); + vec_vsx_st(xxpermdi(output0, dest1, 3), stride, dest); + vec_vsx_st(xxpermdi(output1, dest2, 1), 2 * stride, dest); + vec_vsx_st(xxpermdi(output1, dest3, 3), 3 * stride, dest); + vec_vsx_st(xxpermdi(output2, dest4, 1), 4 * stride, dest); + vec_vsx_st(xxpermdi(output2, dest5, 3), 5 * stride, dest); + vec_vsx_st(xxpermdi(output3, dest6, 1), 6 * stride, dest); + vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest); +} + +#define LOAD_INPUT16(load, source, offset, step, in0, in1, in2, in3, in4, in5, \ + in6, in7, in8, in9, inA, inB, inC, inD, inE, inF) \ + in0 = load(offset, source); \ + in1 = load((step) + (offset), source); \ + in2 = load(2 * (step) + (offset), source); \ + in3 = load(3 * (step) + (offset), source); \ + in4 = load(4 * (step) + (offset), source); \ + in5 = load(5 * (step) + (offset), source); \ + in6 = load(6 * (step) + (offset), source); \ + in7 = load(7 * (step) + (offset), source); \ + in8 = load(8 * (step) + (offset), source); \ + in9 = load(9 * (step) + (offset), source); \ + inA = load(10 * (step) + (offset), source); \ + inB = load(11 * (step) + (offset), source); \ + inC = load(12 * (step) + (offset), source); \ + inD = load(13 * (step) + (offset), source); \ + inE = load(14 * (step) + (offset), source); \ + inF = load(15 * (step) + (offset), source); + +#define STEP16_1(inpt0, inpt1, outpt0, outpt1, cospi) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_mule(tmp16_0, cospi); \ + temp11 = vec_mule(tmp16_1, cospi); \ + temp20 = vec_mulo(tmp16_0, cospi); \ + temp21 = vec_mulo(tmp16_1, cospi); \ + temp30 = vec_sub(temp10, temp20); \ + temp10 = vec_add(temp10, temp20); \ + temp20 = vec_sub(temp11, temp21); \ + temp21 = vec_add(temp11, temp21); \ + DCT_CONST_ROUND_SHIFT(temp30); \ + DCT_CONST_ROUND_SHIFT(temp20); \ + outpt0 = vec_packs(temp30, temp20); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp21); \ + outpt1 = vec_packs(temp10, temp21); + +#define IDCT16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, inA, inB, \ + inC, inD, inE, inF, out0, out1, out2, out3, out4, out5, out6, \ + out7, out8, out9, outA, outB, outC, outD, outE, outF) \ + /* stage 1 */ \ + /* out0 = in0; */ \ + out1 = in8; \ + out2 = in4; \ + out3 = inC; \ + out4 = in2; \ + out5 = inA; \ + out6 = in6; \ + out7 = inE; \ + out8 = in1; \ + out9 = in9; \ + outA = in5; \ + outB = inD; \ + outC = in3; \ + outD = inB; \ + outE = in7; \ + outF = inF; \ + \ + /* stage 2 */ \ + /* in0 = out0; */ \ + in1 = out1; \ + in2 = out2; \ + in3 = out3; \ + in4 = out4; \ + in5 = out5; \ + in6 = out6; \ + in7 = out7; \ + \ + STEP8_0(out8, outF, in8, inF, cospi30_v, cospi2_v); \ + STEP8_0(out9, outE, in9, inE, cospi14_v, cospi18_v); \ + STEP8_0(outA, outD, inA, inD, cospi22_v, cospi10_v); \ + STEP8_0(outB, outC, inB, inC, cospi6_v, cospi26_v); \ + \ + /* stage 3 */ \ + out0 = in0; \ + out1 = in1; \ + out2 = in2; \ + out3 = in3; \ + \ + STEP8_0(in4, in7, out4, out7, cospi28_v, cospi4_v); \ + STEP8_0(in5, in6, out5, out6, cospi12_v, cospi20_v); \ + \ + out8 = vec_add(in8, in9); \ + out9 = vec_sub(in8, in9); \ + outA = vec_sub(inB, inA); \ + outB = vec_add(inA, inB); \ + outC = vec_add(inC, inD); \ + outD = vec_sub(inC, inD); \ + outE = vec_sub(inF, inE); \ + outF = vec_add(inE, inF); \ + \ + /* stage 4 */ \ 
+ STEP16_1(out0, out1, in1, in0, cospi16_v); \ + STEP8_0(out2, out3, in2, in3, cospi24_v, cospi8_v); \ + in4 = vec_add(out4, out5); \ + in5 = vec_sub(out4, out5); \ + in6 = vec_sub(out7, out6); \ + in7 = vec_add(out6, out7); \ + \ + in8 = out8; \ + inF = outF; \ + tmp16_0 = vec_mergeh(out9, outE); \ + tmp16_1 = vec_mergel(out9, outE); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + in9 = vec_packs(temp10, temp11); \ + temp10 = vec_add(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \ + temp11 = vec_add(vec_mule(tmp16_1, cospi24_v), vec_mulo(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + inE = vec_packs(temp10, temp11); \ + \ + tmp16_0 = vec_mergeh(outA, outD); \ + tmp16_1 = vec_mergel(outA, outD); \ + temp10 = \ + vec_sub(vec_mule(tmp16_0, cospi24_mv), vec_mulo(tmp16_0, cospi8_v)); \ + temp11 = \ + vec_sub(vec_mule(tmp16_1, cospi24_mv), vec_mulo(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + inA = vec_packs(temp10, temp11); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + inD = vec_packs(temp10, temp11); \ + \ + inB = outB; \ + inC = outC; \ + \ + /* stage 5 */ \ + out0 = vec_add(in0, in3); \ + out1 = vec_add(in1, in2); \ + out2 = vec_sub(in1, in2); \ + out3 = vec_sub(in0, in3); \ + out4 = in4; \ + STEP16_1(in6, in5, out5, out6, cospi16_v); \ + out7 = in7; \ + \ + out8 = vec_add(in8, inB); \ + out9 = vec_add(in9, inA); \ + outA = vec_sub(in9, inA); \ + outB = vec_sub(in8, inB); \ + outC = vec_sub(inF, inC); \ + outD = vec_sub(inE, inD); \ + outE = vec_add(inD, inE); \ + outF = vec_add(inC, inF); \ + \ + /* stage 6 */ \ + in0 = vec_add(out0, out7); \ + in1 = vec_add(out1, out6); \ + in2 = vec_add(out2, out5); \ + in3 = vec_add(out3, out4); \ + in4 = vec_sub(out3, out4); \ + in5 = vec_sub(out2, out5); \ + in6 = vec_sub(out1, out6); \ + in7 = vec_sub(out0, out7); \ + in8 = out8; \ + in9 = out9; \ + STEP16_1(outD, outA, inA, inD, cospi16_v); \ + STEP16_1(outC, outB, inB, inC, cospi16_v); \ + inE = outE; \ + inF = outF; \ + \ + /* stage 7 */ \ + out0 = vec_add(in0, inF); \ + out1 = vec_add(in1, inE); \ + out2 = vec_add(in2, inD); \ + out3 = vec_add(in3, inC); \ + out4 = vec_add(in4, inB); \ + out5 = vec_add(in5, inA); \ + out6 = vec_add(in6, in9); \ + out7 = vec_add(in7, in8); \ + out8 = vec_sub(in7, in8); \ + out9 = vec_sub(in6, in9); \ + outA = vec_sub(in5, inA); \ + outB = vec_sub(in4, inB); \ + outC = vec_sub(in3, inC); \ + outD = vec_sub(in2, inD); \ + outE = vec_sub(in1, inE); \ + outF = vec_sub(in0, inF); + +#define PIXEL_ADD_STORE16(in0, in1, dst, offset) \ + d_uh = (int16x8_t)vec_mergeh(dst, zerov); \ + d_ul = (int16x8_t)vec_mergel(dst, zerov); \ + PIXEL_ADD(in0, d_uh, add, shift6); \ + PIXEL_ADD(in1, d_ul, add, shift6); \ + vec_vsx_st(vec_packsu(d_uh, d_ul), offset, dest); + +void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int32x4_t temp10, temp11, temp20, temp21, temp30; + int16x8_t src00, src01, src02, src03, src04, src05, src06, src07, src10, + src11, src12, src13, src14, src15, src16, src17; + int16x8_t src20, src21, src22, src23, src24, src25, src26, src27, src30, + 
src31, src32, src33, src34, src35, src36, src37; + int16x8_t tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, + tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp16_0, tmp16_1; + int16x8_t tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, + tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37; + uint8x16_t dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7, dest8, + dest9, destA, destB, destC, destD, destE, destF; + int16x8_t d_uh, d_ul; + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2)); + uint16x8_t shift6 = vec_splat_u16(6); + uint8x16_t zerov = vec_splat_u8(0); + ROUND_SHIFT_INIT; + + // transform rows + // load and transform the upper half of 16x16 matrix + LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src00, src10, src01, + src11, src02, src12, src03, src13, src04, src14, src05, src15, + src06, src16, src07, src17); + TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00, + tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07); + TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10, + tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17); + IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, tmp11, + tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, src00, src01, src02, src03, + src04, src05, src06, src07, src10, src11, src12, src13, src14, src15, + src16, src17); + TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00, + tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07); + TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10, + tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17); + + // load and transform the lower half of 16x16 matrix + LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input), + 8 * sizeof(*input), src20, src30, src21, src31, src22, src32, + src23, src33, src24, src34, src25, src35, src26, src36, src27, + src37); + TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20, + tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27); + TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30, + tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37); + IDCT16(tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, tmp31, + tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src20, src21, src22, src23, + src24, src25, src26, src27, src30, src31, src32, src33, src34, src35, + src36, src37); + TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20, + tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27); + TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30, + tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37); + + // transform columns + // left half first + IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp20, tmp21, + tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, src00, src01, src02, src03, + src04, src05, src06, src07, src20, src21, src22, src23, src24, src25, + src26, src27); + // right half + IDCT16(tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp30, tmp31, + tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src10, src11, src12, src13, + src14, src15, src16, src17, src30, src31, src32, src33, src34, src35, + src36, src37); + + // load dest + LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, dest0, dest1, dest2, dest3, dest4, + dest5, dest6, dest7, dest8, dest9, destA, destB, destC, destD, + destE, destF); + + PIXEL_ADD_STORE16(src00, src10, dest0, 0); + PIXEL_ADD_STORE16(src01, src11, dest1, stride); + PIXEL_ADD_STORE16(src02, src12, dest2, 2 * stride); + 
PIXEL_ADD_STORE16(src03, src13, dest3, 3 * stride); + PIXEL_ADD_STORE16(src04, src14, dest4, 4 * stride); + PIXEL_ADD_STORE16(src05, src15, dest5, 5 * stride); + PIXEL_ADD_STORE16(src06, src16, dest6, 6 * stride); + PIXEL_ADD_STORE16(src07, src17, dest7, 7 * stride); + + PIXEL_ADD_STORE16(src20, src30, dest8, 8 * stride); + PIXEL_ADD_STORE16(src21, src31, dest9, 9 * stride); + PIXEL_ADD_STORE16(src22, src32, destA, 10 * stride); + PIXEL_ADD_STORE16(src23, src33, destB, 11 * stride); + PIXEL_ADD_STORE16(src24, src34, destC, 12 * stride); + PIXEL_ADD_STORE16(src25, src35, destD, 13 * stride); + PIXEL_ADD_STORE16(src26, src36, destE, 14 * stride); + PIXEL_ADD_STORE16(src27, src37, destF, 15 * stride); +} + +#define LOAD_8x32(load, in00, in01, in02, in03, in10, in11, in12, in13, in20, \ + in21, in22, in23, in30, in31, in32, in33, in40, in41, in42, \ + in43, in50, in51, in52, in53, in60, in61, in62, in63, in70, \ + in71, in72, in73, offset) \ + /* load the first row from the 8x32 block*/ \ + in00 = load(offset, input); \ + in01 = load(offset + 16, input); \ + in02 = load(offset + 2 * 16, input); \ + in03 = load(offset + 3 * 16, input); \ + \ + in10 = load(offset + 4 * 16, input); \ + in11 = load(offset + 5 * 16, input); \ + in12 = load(offset + 6 * 16, input); \ + in13 = load(offset + 7 * 16, input); \ + \ + in20 = load(offset + 8 * 16, input); \ + in21 = load(offset + 9 * 16, input); \ + in22 = load(offset + 10 * 16, input); \ + in23 = load(offset + 11 * 16, input); \ + \ + in30 = load(offset + 12 * 16, input); \ + in31 = load(offset + 13 * 16, input); \ + in32 = load(offset + 14 * 16, input); \ + in33 = load(offset + 15 * 16, input); \ + \ + in40 = load(offset + 16 * 16, input); \ + in41 = load(offset + 17 * 16, input); \ + in42 = load(offset + 18 * 16, input); \ + in43 = load(offset + 19 * 16, input); \ + \ + in50 = load(offset + 20 * 16, input); \ + in51 = load(offset + 21 * 16, input); \ + in52 = load(offset + 22 * 16, input); \ + in53 = load(offset + 23 * 16, input); \ + \ + in60 = load(offset + 24 * 16, input); \ + in61 = load(offset + 25 * 16, input); \ + in62 = load(offset + 26 * 16, input); \ + in63 = load(offset + 27 * 16, input); \ + \ + /* load the last row from the 8x32 block*/ \ + in70 = load(offset + 28 * 16, input); \ + in71 = load(offset + 29 * 16, input); \ + in72 = load(offset + 30 * 16, input); \ + in73 = load(offset + 31 * 16, input); + +/* for the: temp1 = -step[x] * cospi_q + step[y] * cospi_z + * temp2 = step[x] * cospi_z + step[y] * cospi_q */ +#define STEP32(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi1), vec_mule(tmp16_0, cospi0)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi1), vec_mule(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \ + temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +/* for the: temp1 = -step[x] * cospi_q - step[y] * cospi_z + * temp2 = -step[x] * cospi_z + step[y] * cospi_q */ +#define STEP32_1(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1, cospi1m) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi1m), vec_mule(tmp16_0, cospi0)); \ + temp11 = 
vec_sub(vec_mulo(tmp16_1, cospi1m), vec_mule(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi0), vec_mule(tmp16_0, cospi1)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi0), vec_mule(tmp16_1, cospi1)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +#define IDCT32(in0, in1, in2, in3, out) \ + \ + /* stage 1 */ \ + /* out[0][0] = in[0][0]; */ \ + out[0][1] = in2[0]; \ + out[0][2] = in1[0]; \ + out[0][3] = in3[0]; \ + out[0][4] = in0[4]; \ + out[0][5] = in2[4]; \ + out[0][6] = in1[4]; \ + out[0][7] = in3[4]; \ + out[1][0] = in0[2]; \ + out[1][1] = in2[2]; \ + out[1][2] = in1[2]; \ + out[1][3] = in3[2]; \ + out[1][4] = in0[6]; \ + out[1][5] = in2[6]; \ + out[1][6] = in1[6]; \ + out[1][7] = in3[6]; \ + \ + STEP8_0(in0[1], in3[7], out[2][0], out[3][7], cospi31_v, cospi1_v); \ + STEP8_0(in2[1], in1[7], out[2][1], out[3][6], cospi15_v, cospi17_v); \ + STEP8_0(in1[1], in2[7], out[2][2], out[3][5], cospi23_v, cospi9_v); \ + STEP8_0(in3[1], in0[7], out[2][3], out[3][4], cospi7_v, cospi25_v); \ + STEP8_0(in0[5], in3[3], out[2][4], out[3][3], cospi27_v, cospi5_v); \ + STEP8_0(in2[5], in1[3], out[2][5], out[3][2], cospi11_v, cospi21_v); \ + STEP8_0(in1[5], in2[3], out[2][6], out[3][1], cospi19_v, cospi13_v); \ + STEP8_0(in3[5], in0[3], out[2][7], out[3][0], cospi3_v, cospi29_v); \ + \ + /* stage 2 */ \ + /* in0[0] = out[0][0]; */ \ + in0[1] = out[0][1]; \ + in0[2] = out[0][2]; \ + in0[3] = out[0][3]; \ + in0[4] = out[0][4]; \ + in0[5] = out[0][5]; \ + in0[6] = out[0][6]; \ + in0[7] = out[0][7]; \ + \ + STEP8_0(out[1][0], out[1][7], in1[0], in1[7], cospi30_v, cospi2_v); \ + STEP8_0(out[1][1], out[1][6], in1[1], in1[6], cospi14_v, cospi18_v); \ + STEP8_0(out[1][2], out[1][5], in1[2], in1[5], cospi22_v, cospi10_v); \ + STEP8_0(out[1][3], out[1][4], in1[3], in1[4], cospi6_v, cospi26_v); \ + \ + in2[0] = vec_add(out[2][0], out[2][1]); \ + in2[1] = vec_sub(out[2][0], out[2][1]); \ + in2[2] = vec_sub(out[2][3], out[2][2]); \ + in2[3] = vec_add(out[2][3], out[2][2]); \ + in2[4] = vec_add(out[2][4], out[2][5]); \ + in2[5] = vec_sub(out[2][4], out[2][5]); \ + in2[6] = vec_sub(out[2][7], out[2][6]); \ + in2[7] = vec_add(out[2][7], out[2][6]); \ + in3[0] = vec_add(out[3][0], out[3][1]); \ + in3[1] = vec_sub(out[3][0], out[3][1]); \ + in3[2] = vec_sub(out[3][3], out[3][2]); \ + in3[3] = vec_add(out[3][3], out[3][2]); \ + in3[4] = vec_add(out[3][4], out[3][5]); \ + in3[5] = vec_sub(out[3][4], out[3][5]); \ + in3[6] = vec_sub(out[3][7], out[3][6]); \ + in3[7] = vec_add(out[3][6], out[3][7]); \ + \ + /* stage 3 */ \ + out[0][0] = in0[0]; \ + out[0][1] = in0[1]; \ + out[0][2] = in0[2]; \ + out[0][3] = in0[3]; \ + \ + STEP8_0(in0[4], in0[7], out[0][4], out[0][7], cospi28_v, cospi4_v); \ + STEP8_0(in0[5], in0[6], out[0][5], out[0][6], cospi12_v, cospi20_v); \ + \ + out[1][0] = vec_add(in1[0], in1[1]); \ + out[1][1] = vec_sub(in1[0], in1[1]); \ + out[1][2] = vec_sub(in1[3], in1[2]); \ + out[1][3] = vec_add(in1[2], in1[3]); \ + out[1][4] = vec_add(in1[4], in1[5]); \ + out[1][5] = vec_sub(in1[4], in1[5]); \ + out[1][6] = vec_sub(in1[7], in1[6]); \ + out[1][7] = vec_add(in1[6], in1[7]); \ + \ + out[2][0] = in2[0]; \ + out[3][7] = in3[7]; \ + STEP32(in2[1], in3[6], out[2][1], out[3][6], cospi4_v, cospi28_v); \ + STEP32_1(in2[2], in3[5], out[2][2], out[3][5], cospi28_v, cospi4_v, \ + cospi4m_v); \ + out[2][3] = in2[3]; \ + out[2][4] = 
in2[4]; \ + STEP32(in2[5], in3[2], out[2][5], out[3][2], cospi20_v, cospi12_v); \ + STEP32_1(in2[6], in3[1], out[2][6], out[3][1], cospi12_v, cospi20_v, \ + cospi20m_v); \ + out[2][7] = in2[7]; \ + out[3][0] = in3[0]; \ + out[3][3] = in3[3]; \ + out[3][4] = in3[4]; \ + \ + /* stage 4 */ \ + STEP16_1(out[0][0], out[0][1], in0[1], in0[0], cospi16_v); \ + STEP8_0(out[0][2], out[0][3], in0[2], in0[3], cospi24_v, cospi8_v); \ + in0[4] = vec_add(out[0][4], out[0][5]); \ + in0[5] = vec_sub(out[0][4], out[0][5]); \ + in0[6] = vec_sub(out[0][7], out[0][6]); \ + in0[7] = vec_add(out[0][7], out[0][6]); \ + \ + in1[0] = out[1][0]; \ + in1[7] = out[1][7]; \ + STEP32(out[1][1], out[1][6], in1[1], in1[6], cospi8_v, cospi24_v); \ + STEP32_1(out[1][2], out[1][5], in1[2], in1[5], cospi24_v, cospi8_v, \ + cospi8m_v); \ + in1[3] = out[1][3]; \ + in1[4] = out[1][4]; \ + \ + in2[0] = vec_add(out[2][0], out[2][3]); \ + in2[1] = vec_add(out[2][1], out[2][2]); \ + in2[2] = vec_sub(out[2][1], out[2][2]); \ + in2[3] = vec_sub(out[2][0], out[2][3]); \ + in2[4] = vec_sub(out[2][7], out[2][4]); \ + in2[5] = vec_sub(out[2][6], out[2][5]); \ + in2[6] = vec_add(out[2][5], out[2][6]); \ + in2[7] = vec_add(out[2][4], out[2][7]); \ + \ + in3[0] = vec_add(out[3][0], out[3][3]); \ + in3[1] = vec_add(out[3][1], out[3][2]); \ + in3[2] = vec_sub(out[3][1], out[3][2]); \ + in3[3] = vec_sub(out[3][0], out[3][3]); \ + in3[4] = vec_sub(out[3][7], out[3][4]); \ + in3[5] = vec_sub(out[3][6], out[3][5]); \ + in3[6] = vec_add(out[3][5], out[3][6]); \ + in3[7] = vec_add(out[3][4], out[3][7]); \ + \ + /* stage 5 */ \ + out[0][0] = vec_add(in0[0], in0[3]); \ + out[0][1] = vec_add(in0[1], in0[2]); \ + out[0][2] = vec_sub(in0[1], in0[2]); \ + out[0][3] = vec_sub(in0[0], in0[3]); \ + out[0][4] = in0[4]; \ + STEP16_1(in0[6], in0[5], out[0][5], out[0][6], cospi16_v); \ + out[0][7] = in0[7]; \ + \ + out[1][0] = vec_add(in1[0], in1[3]); \ + out[1][1] = vec_add(in1[1], in1[2]); \ + out[1][2] = vec_sub(in1[1], in1[2]); \ + out[1][3] = vec_sub(in1[0], in1[3]); \ + out[1][4] = vec_sub(in1[7], in1[4]); \ + out[1][5] = vec_sub(in1[6], in1[5]); \ + out[1][6] = vec_add(in1[5], in1[6]); \ + out[1][7] = vec_add(in1[4], in1[7]); \ + \ + out[2][0] = in2[0]; \ + out[2][1] = in2[1]; \ + STEP32(in2[2], in3[5], out[2][2], out[3][5], cospi8_v, cospi24_v); \ + STEP32(in2[3], in3[4], out[2][3], out[3][4], cospi8_v, cospi24_v); \ + STEP32_1(in2[4], in3[3], out[2][4], out[3][3], cospi24_v, cospi8_v, \ + cospi8m_v); \ + STEP32_1(in2[5], in3[2], out[2][5], out[3][2], cospi24_v, cospi8_v, \ + cospi8m_v); \ + out[2][6] = in2[6]; \ + out[2][7] = in2[7]; \ + out[3][0] = in3[0]; \ + out[3][1] = in3[1]; \ + out[3][6] = in3[6]; \ + out[3][7] = in3[7]; \ + \ + /* stage 6 */ \ + in0[0] = vec_add(out[0][0], out[0][7]); \ + in0[1] = vec_add(out[0][1], out[0][6]); \ + in0[2] = vec_add(out[0][2], out[0][5]); \ + in0[3] = vec_add(out[0][3], out[0][4]); \ + in0[4] = vec_sub(out[0][3], out[0][4]); \ + in0[5] = vec_sub(out[0][2], out[0][5]); \ + in0[6] = vec_sub(out[0][1], out[0][6]); \ + in0[7] = vec_sub(out[0][0], out[0][7]); \ + in1[0] = out[1][0]; \ + in1[1] = out[1][1]; \ + STEP16_1(out[1][5], out[1][2], in1[2], in1[5], cospi16_v); \ + STEP16_1(out[1][4], out[1][3], in1[3], in1[4], cospi16_v); \ + in1[6] = out[1][6]; \ + in1[7] = out[1][7]; \ + \ + in2[0] = vec_add(out[2][0], out[2][7]); \ + in2[1] = vec_add(out[2][1], out[2][6]); \ + in2[2] = vec_add(out[2][2], out[2][5]); \ + in2[3] = vec_add(out[2][3], out[2][4]); \ + in2[4] = vec_sub(out[2][3], out[2][4]); \ + in2[5] = 
vec_sub(out[2][2], out[2][5]); \ + in2[6] = vec_sub(out[2][1], out[2][6]); \ + in2[7] = vec_sub(out[2][0], out[2][7]); \ + \ + in3[0] = vec_sub(out[3][7], out[3][0]); \ + in3[1] = vec_sub(out[3][6], out[3][1]); \ + in3[2] = vec_sub(out[3][5], out[3][2]); \ + in3[3] = vec_sub(out[3][4], out[3][3]); \ + in3[4] = vec_add(out[3][4], out[3][3]); \ + in3[5] = vec_add(out[3][5], out[3][2]); \ + in3[6] = vec_add(out[3][6], out[3][1]); \ + in3[7] = vec_add(out[3][7], out[3][0]); \ + \ + /* stage 7 */ \ + out[0][0] = vec_add(in0[0], in1[7]); \ + out[0][1] = vec_add(in0[1], in1[6]); \ + out[0][2] = vec_add(in0[2], in1[5]); \ + out[0][3] = vec_add(in0[3], in1[4]); \ + out[0][4] = vec_add(in0[4], in1[3]); \ + out[0][5] = vec_add(in0[5], in1[2]); \ + out[0][6] = vec_add(in0[6], in1[1]); \ + out[0][7] = vec_add(in0[7], in1[0]); \ + out[1][0] = vec_sub(in0[7], in1[0]); \ + out[1][1] = vec_sub(in0[6], in1[1]); \ + out[1][2] = vec_sub(in0[5], in1[2]); \ + out[1][3] = vec_sub(in0[4], in1[3]); \ + out[1][4] = vec_sub(in0[3], in1[4]); \ + out[1][5] = vec_sub(in0[2], in1[5]); \ + out[1][6] = vec_sub(in0[1], in1[6]); \ + out[1][7] = vec_sub(in0[0], in1[7]); \ + \ + out[2][0] = in2[0]; \ + out[2][1] = in2[1]; \ + out[2][2] = in2[2]; \ + out[2][3] = in2[3]; \ + STEP16_1(in3[3], in2[4], out[2][4], out[3][3], cospi16_v); \ + STEP16_1(in3[2], in2[5], out[2][5], out[3][2], cospi16_v); \ + STEP16_1(in3[1], in2[6], out[2][6], out[3][1], cospi16_v); \ + STEP16_1(in3[0], in2[7], out[2][7], out[3][0], cospi16_v); \ + out[3][4] = in3[4]; \ + out[3][5] = in3[5]; \ + out[3][6] = in3[6]; \ + out[3][7] = in3[7]; \ + \ + /* final */ \ + in0[0] = vec_add(out[0][0], out[3][7]); \ + in0[1] = vec_add(out[0][1], out[3][6]); \ + in0[2] = vec_add(out[0][2], out[3][5]); \ + in0[3] = vec_add(out[0][3], out[3][4]); \ + in0[4] = vec_add(out[0][4], out[3][3]); \ + in0[5] = vec_add(out[0][5], out[3][2]); \ + in0[6] = vec_add(out[0][6], out[3][1]); \ + in0[7] = vec_add(out[0][7], out[3][0]); \ + in1[0] = vec_add(out[1][0], out[2][7]); \ + in1[1] = vec_add(out[1][1], out[2][6]); \ + in1[2] = vec_add(out[1][2], out[2][5]); \ + in1[3] = vec_add(out[1][3], out[2][4]); \ + in1[4] = vec_add(out[1][4], out[2][3]); \ + in1[5] = vec_add(out[1][5], out[2][2]); \ + in1[6] = vec_add(out[1][6], out[2][1]); \ + in1[7] = vec_add(out[1][7], out[2][0]); \ + in2[0] = vec_sub(out[1][7], out[2][0]); \ + in2[1] = vec_sub(out[1][6], out[2][1]); \ + in2[2] = vec_sub(out[1][5], out[2][2]); \ + in2[3] = vec_sub(out[1][4], out[2][3]); \ + in2[4] = vec_sub(out[1][3], out[2][4]); \ + in2[5] = vec_sub(out[1][2], out[2][5]); \ + in2[6] = vec_sub(out[1][1], out[2][6]); \ + in2[7] = vec_sub(out[1][0], out[2][7]); \ + in3[0] = vec_sub(out[0][7], out[3][0]); \ + in3[1] = vec_sub(out[0][6], out[3][1]); \ + in3[2] = vec_sub(out[0][5], out[3][2]); \ + in3[3] = vec_sub(out[0][4], out[3][3]); \ + in3[4] = vec_sub(out[0][3], out[3][4]); \ + in3[5] = vec_sub(out[0][2], out[3][5]); \ + in3[6] = vec_sub(out[0][1], out[3][6]); \ + in3[7] = vec_sub(out[0][0], out[3][7]); + +// NOT A FULL TRANSPOSE! 
Transposes just each 8x8 block in each row, +// does not transpose rows +#define TRANSPOSE_8x32(in, out) \ + /* transpose 4 of 8x8 blocks */ \ + TRANSPOSE8x8(in[0][0], in[0][1], in[0][2], in[0][3], in[0][4], in[0][5], \ + in[0][6], in[0][7], out[0][0], out[0][1], out[0][2], out[0][3], \ + out[0][4], out[0][5], out[0][6], out[0][7]); \ + TRANSPOSE8x8(in[1][0], in[1][1], in[1][2], in[1][3], in[1][4], in[1][5], \ + in[1][6], in[1][7], out[1][0], out[1][1], out[1][2], out[1][3], \ + out[1][4], out[1][5], out[1][6], out[1][7]); \ + TRANSPOSE8x8(in[2][0], in[2][1], in[2][2], in[2][3], in[2][4], in[2][5], \ + in[2][6], in[2][7], out[2][0], out[2][1], out[2][2], out[2][3], \ + out[2][4], out[2][5], out[2][6], out[2][7]); \ + TRANSPOSE8x8(in[3][0], in[3][1], in[3][2], in[3][3], in[3][4], in[3][5], \ + in[3][6], in[3][7], out[3][0], out[3][1], out[3][2], out[3][3], \ + out[3][4], out[3][5], out[3][6], out[3][7]); + +#define PIXEL_ADD_STORE32(in0, in1, in2, in3, step) \ + dst = vec_vsx_ld((step)*stride, dest); \ + d_uh = (int16x8_t)vec_mergeh(dst, zerov); \ + d_ul = (int16x8_t)vec_mergel(dst, zerov); \ + PIXEL_ADD(in0, d_uh, add, shift6); \ + PIXEL_ADD(in1, d_ul, add, shift6); \ + vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride, dest); \ + dst = vec_vsx_ld((step)*stride + 16, dest); \ + d_uh = (int16x8_t)vec_mergeh(dst, zerov); \ + d_ul = (int16x8_t)vec_mergel(dst, zerov); \ + PIXEL_ADD(in2, d_uh, add, shift6); \ + PIXEL_ADD(in3, d_ul, add, shift6); \ + vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride + 16, dest); + +#define ADD_STORE_BLOCK(in, offset) \ + PIXEL_ADD_STORE32(in[0][0], in[1][0], in[2][0], in[3][0], offset + 0); \ + PIXEL_ADD_STORE32(in[0][1], in[1][1], in[2][1], in[3][1], offset + 1); \ + PIXEL_ADD_STORE32(in[0][2], in[1][2], in[2][2], in[3][2], offset + 2); \ + PIXEL_ADD_STORE32(in[0][3], in[1][3], in[2][3], in[3][3], offset + 3); \ + PIXEL_ADD_STORE32(in[0][4], in[1][4], in[2][4], in[3][4], offset + 4); \ + PIXEL_ADD_STORE32(in[0][5], in[1][5], in[2][5], in[3][5], offset + 5); \ + PIXEL_ADD_STORE32(in[0][6], in[1][6], in[2][6], in[3][6], offset + 6); \ + PIXEL_ADD_STORE32(in[0][7], in[1][7], in[2][7], in[3][7], offset + 7); + +void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t src0[4][8], src1[4][8], src2[4][8], src3[4][8], tmp[4][8]; + int16x8_t tmp16_0, tmp16_1; + int32x4_t temp10, temp11, temp20, temp21, temp30; + uint8x16_t dst; + int16x8_t d_uh, d_ul; + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2)); + uint16x8_t shift6 = vec_splat_u16(6); + uint8x16_t zerov = vec_splat_u8(0); + + ROUND_SHIFT_INIT; + + LOAD_8x32(load_tran_low, src0[0][0], src0[1][0], src0[2][0], src0[3][0], + src0[0][1], src0[1][1], src0[2][1], src0[3][1], src0[0][2], + src0[1][2], src0[2][2], src0[3][2], src0[0][3], src0[1][3], + src0[2][3], src0[3][3], src0[0][4], src0[1][4], src0[2][4], + src0[3][4], src0[0][5], src0[1][5], src0[2][5], src0[3][5], + src0[0][6], src0[1][6], src0[2][6], src0[3][6], src0[0][7], + src0[1][7], src0[2][7], src0[3][7], 0); + // Rows + // transpose the first row of 8x8 blocks + TRANSPOSE_8x32(src0, tmp); + // transform the 32x8 column + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src0); + TRANSPOSE_8x32(tmp, src0); + + LOAD_8x32(load_tran_low, src1[0][0], src1[1][0], src1[2][0], src1[3][0], + src1[0][1], src1[1][1], src1[2][1], src1[3][1], src1[0][2], + src1[1][2], src1[2][2], src1[3][2], src1[0][3], src1[1][3], + src1[2][3], src1[3][3], src1[0][4], src1[1][4], src1[2][4], + src1[3][4], src1[0][5], src1[1][5], src1[2][5], 
src1[3][5], + src1[0][6], src1[1][6], src1[2][6], src1[3][6], src1[0][7], + src1[1][7], src1[2][7], src1[3][7], 512); + TRANSPOSE_8x32(src1, tmp); + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src1); + TRANSPOSE_8x32(tmp, src1); + + LOAD_8x32(load_tran_low, src2[0][0], src2[1][0], src2[2][0], src2[3][0], + src2[0][1], src2[1][1], src2[2][1], src2[3][1], src2[0][2], + src2[1][2], src2[2][2], src2[3][2], src2[0][3], src2[1][3], + src2[2][3], src2[3][3], src2[0][4], src2[1][4], src2[2][4], + src2[3][4], src2[0][5], src2[1][5], src2[2][5], src2[3][5], + src2[0][6], src2[1][6], src2[2][6], src2[3][6], src2[0][7], + src2[1][7], src2[2][7], src2[3][7], 1024); + TRANSPOSE_8x32(src2, tmp); + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src2); + TRANSPOSE_8x32(tmp, src2); + + LOAD_8x32(load_tran_low, src3[0][0], src3[1][0], src3[2][0], src3[3][0], + src3[0][1], src3[1][1], src3[2][1], src3[3][1], src3[0][2], + src3[1][2], src3[2][2], src3[3][2], src3[0][3], src3[1][3], + src3[2][3], src3[3][3], src3[0][4], src3[1][4], src3[2][4], + src3[3][4], src3[0][5], src3[1][5], src3[2][5], src3[3][5], + src3[0][6], src3[1][6], src3[2][6], src3[3][6], src3[0][7], + src3[1][7], src3[2][7], src3[3][7], 1536); + TRANSPOSE_8x32(src3, tmp); + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src3); + TRANSPOSE_8x32(tmp, src3); + + // Columns + IDCT32(src0[0], src1[0], src2[0], src3[0], tmp); + IDCT32(src0[1], src1[1], src2[1], src3[1], tmp); + IDCT32(src0[2], src1[2], src2[2], src3[2], tmp); + IDCT32(src0[3], src1[3], src2[3], src3[3], tmp); + + ADD_STORE_BLOCK(src0, 0); + ADD_STORE_BLOCK(src1, 8); + ADD_STORE_BLOCK(src2, 16); + ADD_STORE_BLOCK(src3, 24); +} diff --git a/libs/libvpx/vpx_dsp/ppc/sad_vsx.c b/libs/libvpx/vpx_dsp/ppc/sad_vsx.c new file mode 100644 index 0000000000..bb49addae1 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/sad_vsx.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "./vpx_dsp_rtcd.h" + +#include "vpx_dsp/ppc/types_vsx.h" + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +#define PROCESS16(offset) \ + v_a = vec_vsx_ld(offset, a); \ + v_b = vec_vsx_ld(offset, b); \ + v_ah = unpack_to_s16_h(v_a); \ + v_al = unpack_to_s16_l(v_a); \ + v_bh = unpack_to_s16_h(v_b); \ + v_bl = unpack_to_s16_l(v_b); \ + v_subh = vec_sub(v_ah, v_bh); \ + v_subl = vec_sub(v_al, v_bl); \ + v_absh = vec_abs(v_subh); \ + v_absl = vec_abs(v_subl); \ + v_sad = vec_sum4s(v_absh, v_sad); \ + v_sad = vec_sum4s(v_absl, v_sad); + +#define SAD16(height) \ + unsigned int vpx_sad16x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ + int32x4_t v_sad = vec_splat_s32(0); \ + \ + for (y = 0; y < height; y++) { \ + PROCESS16(0); \ + \ + a += a_stride; \ + b += b_stride; \ + } \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + \ + return sad[3] + sad[2] + sad[1] + sad[0]; \ + } + +#define SAD32(height) \ + unsigned int vpx_sad32x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ + int32x4_t v_sad = vec_splat_s32(0); \ + \ + for (y = 0; y < height; y++) { \ + PROCESS16(0); \ + PROCESS16(16); \ + \ + a += a_stride; \ + b += b_stride; \ + } \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + \ + return sad[3] + sad[2] + sad[1] + sad[0]; \ + } + +#define SAD64(height) \ + unsigned int vpx_sad64x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ + int32x4_t v_sad = vec_splat_s32(0); \ + \ + for (y = 0; y < height; y++) { \ + PROCESS16(0); \ + PROCESS16(16); \ + PROCESS16(32); \ + PROCESS16(48); \ + \ + a += a_stride; \ + b += b_stride; \ + } \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + \ + return sad[3] + sad[2] + sad[1] + sad[0]; \ + } + +SAD16(8); +SAD16(16); +SAD16(32); +SAD32(16); +SAD32(32); +SAD32(64); +SAD64(32); +SAD64(64); + +#define SAD16AVG(height) \ + unsigned int vpx_sad16x##height##_avg_vsx( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + DECLARE_ALIGNED(16, uint8_t, comp_pred[16 * height]); \ + vpx_comp_avg_pred_vsx(comp_pred, second_pred, 16, height, ref, \ + ref_stride); \ + \ + return vpx_sad16x##height##_vsx(src, src_stride, comp_pred, 16); \ + } + +#define SAD32AVG(height) \ + unsigned int vpx_sad32x##height##_avg_vsx( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + DECLARE_ALIGNED(32, uint8_t, comp_pred[32 * height]); \ + vpx_comp_avg_pred_vsx(comp_pred, second_pred, 32, height, ref, \ + ref_stride); \ + \ + return vpx_sad32x##height##_vsx(src, src_stride, comp_pred, 32); \ + } + +#define SAD64AVG(height) \ + unsigned int vpx_sad64x##height##_avg_vsx( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + DECLARE_ALIGNED(64, uint8_t, comp_pred[64 * height]); \ + vpx_comp_avg_pred_vsx(comp_pred, second_pred, 64, height, ref, \ + ref_stride); \ + return vpx_sad64x##height##_vsx(src, src_stride, comp_pred, 64); \ + } + +SAD16AVG(8); +SAD16AVG(16); +SAD16AVG(32); 
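+// Each *_avg_vsx function scores a compound prediction: vpx_comp_avg_pred_vsx()
+// first averages second_pred with the reference block (rounding to nearest),
+// and the regular vpx_sadNxM_vsx() kernel is then run against that aligned
+// temporary buffer.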
+SAD32AVG(16); +SAD32AVG(32); +SAD32AVG(64); +SAD64AVG(32); +SAD64AVG(64); + +#define PROCESS16_4D(offset, ref, v_h, v_l) \ + v_b = vec_vsx_ld(offset, ref); \ + v_bh = unpack_to_s16_h(v_b); \ + v_bl = unpack_to_s16_l(v_b); \ + v_subh = vec_sub(v_h, v_bh); \ + v_subl = vec_sub(v_l, v_bl); \ + v_absh = vec_abs(v_subh); \ + v_absl = vec_abs(v_subl); \ + v_sad = vec_sum4s(v_absh, v_sad); \ + v_sad = vec_sum4s(v_absl, v_sad); + +#define UNPACK_SRC(offset, srcv_h, srcv_l) \ + v_a = vec_vsx_ld(offset, src); \ + srcv_h = unpack_to_s16_h(v_a); \ + srcv_l = unpack_to_s16_l(v_a); + +#define SAD16_4D(height) \ + void vpx_sad16x##height##x4d_vsx(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ + \ + for (i = 0; i < 4; i++) sad_array[i] = 0; \ + \ + for (y = 0; y < height; y++) { \ + UNPACK_SRC(y *src_stride, v_ah, v_al); \ + for (i = 0; i < 4; i++) { \ + int32x4_t v_sad = vec_splat_s32(0); \ + PROCESS16_4D(y *ref_stride, ref_array[i], v_ah, v_al); \ + \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \ + } \ + } \ + } + +#define SAD32_4D(height) \ + void vpx_sad32x##height##x4d_vsx(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl; \ + int16x8_t v_absh, v_absl, v_subh, v_subl; \ + \ + for (i = 0; i < 4; i++) sad_array[i] = 0; \ + \ + for (y = 0; y < height; y++) { \ + UNPACK_SRC(y *src_stride, v_ah1, v_al1); \ + UNPACK_SRC(y *src_stride + 16, v_ah2, v_al2); \ + for (i = 0; i < 4; i++) { \ + int32x4_t v_sad = vec_splat_s32(0); \ + PROCESS16_4D(y *ref_stride, ref_array[i], v_ah1, v_al1); \ + PROCESS16_4D(y *ref_stride + 16, ref_array[i], v_ah2, v_al2); \ + \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \ + } \ + } \ + } + +#define SAD64_4D(height) \ + void vpx_sad64x##height##x4d_vsx(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl; \ + int16x8_t v_ah3, v_al3, v_ah4, v_al4; \ + int16x8_t v_absh, v_absl, v_subh, v_subl; \ + \ + for (i = 0; i < 4; i++) sad_array[i] = 0; \ + \ + for (y = 0; y < height; y++) { \ + UNPACK_SRC(y *src_stride, v_ah1, v_al1); \ + UNPACK_SRC(y *src_stride + 16, v_ah2, v_al2); \ + UNPACK_SRC(y *src_stride + 32, v_ah3, v_al3); \ + UNPACK_SRC(y *src_stride + 48, v_ah4, v_al4); \ + for (i = 0; i < 4; i++) { \ + int32x4_t v_sad = vec_splat_s32(0); \ + PROCESS16_4D(y *ref_stride, ref_array[i], v_ah1, v_al1); \ + PROCESS16_4D(y *ref_stride + 16, ref_array[i], v_ah2, v_al2); \ + PROCESS16_4D(y *ref_stride + 32, ref_array[i], v_ah3, v_al3); \ + PROCESS16_4D(y *ref_stride + 48, ref_array[i], v_ah4, v_al4); \ + \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \ + } \ + } \ + } + +SAD16_4D(8); +SAD16_4D(16); +SAD16_4D(32); +SAD32_4D(16); +SAD32_4D(32); +SAD32_4D(64); +SAD64_4D(32); +SAD64_4D(64); diff --git a/libs/libvpx/vpx_dsp/ppc/transpose_vsx.h b/libs/libvpx/vpx_dsp/ppc/transpose_vsx.h new file mode 100644 index 0000000000..f02556d522 --- /dev/null +++ 
b/libs/libvpx/vpx_dsp/ppc/transpose_vsx.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_PPC_TRANSPOSE_VSX_H_ +#define VPX_DSP_PPC_TRANSPOSE_VSX_H_ + +#include "./vpx_config.h" +#include "vpx_dsp/ppc/types_vsx.h" + +static INLINE void vpx_transpose_s16_8x8(int16x8_t v[8]) { + // d = vec_mergeh(a,b): + // The even elements of the result are obtained left-to-right, + // from the high elements of a. + // The odd elements of the result are obtained left-to-right, + // from the high elements of b. + // + // d = vec_mergel(a,b): + // The even elements of the result are obtained left-to-right, + // from the low elements of a. + // The odd elements of the result are obtained left-to-right, + // from the low elements of b. + + // Example, starting with: + // v[0]: 00 01 02 03 04 05 06 07 + // v[1]: 10 11 12 13 14 15 16 17 + // v[2]: 20 21 22 23 24 25 26 27 + // v[3]: 30 31 32 33 34 35 36 37 + // v[4]: 40 41 42 43 44 45 46 47 + // v[5]: 50 51 52 53 54 55 56 57 + // v[6]: 60 61 62 63 64 65 66 67 + // v[7]: 70 71 72 73 74 75 76 77 + + int16x8_t b0, b1, b2, b3, b4, b5, b6, b7; + int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + + b0 = vec_mergeh(v[0], v[4]); + b1 = vec_mergel(v[0], v[4]); + b2 = vec_mergeh(v[1], v[5]); + b3 = vec_mergel(v[1], v[5]); + b4 = vec_mergeh(v[2], v[6]); + b5 = vec_mergel(v[2], v[6]); + b6 = vec_mergeh(v[3], v[7]); + b7 = vec_mergel(v[3], v[7]); + + // After first merge operation + // b0: 00 40 01 41 02 42 03 43 + // b1: 04 44 05 45 06 46 07 47 + // b2: 10 50 11 51 12 52 13 53 + // b3: 14 54 15 55 16 56 17 57 + // b4: 20 60 21 61 22 62 23 63 + // b5: 24 64 25 65 26 66 27 67 + // b6: 30 70 31 71 32 62 33 73 + // b7: 34 74 35 75 36 76 37 77 + + c0 = vec_mergeh(b0, b4); + c1 = vec_mergel(b0, b4); + c2 = vec_mergeh(b1, b5); + c3 = vec_mergel(b1, b5); + c4 = vec_mergeh(b2, b6); + c5 = vec_mergel(b2, b6); + c6 = vec_mergeh(b3, b7); + c7 = vec_mergel(b3, b7); + + // After second merge operation + // c0: 00 20 40 60 01 21 41 61 + // c1: 02 22 42 62 03 23 43 63 + // c2: 04 24 44 64 05 25 45 65 + // c3: 06 26 46 66 07 27 47 67 + // c4: 10 30 50 70 11 31 51 71 + // c5: 12 32 52 72 13 33 53 73 + // c6: 14 34 54 74 15 35 55 75 + // c7: 16 36 56 76 17 37 57 77 + + v[0] = vec_mergeh(c0, c4); + v[1] = vec_mergel(c0, c4); + v[2] = vec_mergeh(c1, c5); + v[3] = vec_mergel(c1, c5); + v[4] = vec_mergeh(c2, c6); + v[5] = vec_mergel(c2, c6); + v[6] = vec_mergeh(c3, c7); + v[7] = vec_mergel(c3, c7); + + // After last merge operation + // v[0]: 00 10 20 30 40 50 60 70 + // v[1]: 01 11 21 31 41 51 61 71 + // v[2]: 02 12 22 32 42 52 62 72 + // v[3]: 03 13 23 33 43 53 63 73 + // v[4]: 04 14 24 34 44 54 64 74 + // v[5]: 05 15 25 35 45 55 65 75 + // v[6]: 06 16 26 36 46 56 66 76 + // v[7]: 07 17 27 37 47 57 67 77 +} + +#endif // VPX_DSP_PPC_TRANSPOSE_VSX_H_ diff --git a/libs/libvpx/vpx_dsp/ppc/types_vsx.h b/libs/libvpx/vpx_dsp/ppc/types_vsx.h new file mode 100644 index 0000000000..f611d02d2d --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/types_vsx.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_PPC_TYPES_VSX_H_ +#define VPX_DSP_PPC_TYPES_VSX_H_ + +#include + +typedef vector signed char int8x16_t; +typedef vector unsigned char uint8x16_t; +typedef vector signed short int16x8_t; +typedef vector unsigned short uint16x8_t; +typedef vector signed int int32x4_t; +typedef vector unsigned int uint32x4_t; + +#ifdef __clang__ +static const uint8x16_t xxpermdi0_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17 }; +static const uint8x16_t xxpermdi1_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; +static const uint8x16_t xxpermdi2_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17 }; +static const uint8x16_t xxpermdi3_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; +#define xxpermdi(a, b, c) vec_perm(a, b, xxpermdi##c##_perm) +#elif defined(__GNUC__) && \ + (__GNUC__ > 6 || (__GNUC__ == 6 && __GNUC_MINOR__ >= 3)) +#define xxpermdi(a, b, c) vec_xxpermdi(a, b, c) +#endif + +#ifdef WORDS_BIGENDIAN +#define unpack_to_u16_h(v) \ + (uint16x8_t) vec_mergeh(vec_splat_u8(0), (uint8x16_t)v) +#define unpack_to_u16_l(v) \ + (uint16x8_t) vec_mergel(vec_splat_u8(0), (uint8x16_t)v) +#define unpack_to_s16_h(v) \ + (int16x8_t) vec_mergeh(vec_splat_u8(0), (uint8x16_t)v) +#define unpack_to_s16_l(v) \ + (int16x8_t) vec_mergel(vec_splat_u8(0), (uint8x16_t)v) +#ifndef xxpermdi +#define xxpermdi(a, b, c) vec_xxpermdi(a, b, c) +#endif +#else +#define unpack_to_u16_h(v) \ + (uint16x8_t) vec_mergeh((uint8x16_t)v, vec_splat_u8(0)) +#define unpack_to_u16_l(v) \ + (uint16x8_t) vec_mergel((uint8x16_t)v, vec_splat_u8(0)) +#define unpack_to_s16_h(v) \ + (int16x8_t) vec_mergeh((uint8x16_t)v, vec_splat_u8(0)) +#define unpack_to_s16_l(v) \ + (int16x8_t) vec_mergel((uint8x16_t)v, vec_splat_u8(0)) +#ifndef xxpermdi +#define xxpermdi(a, b, c) vec_xxpermdi(b, a, ((c >> 1) | (c & 1) << 1) ^ 3) +#endif +#endif + +#endif // VPX_DSP_PPC_TYPES_VSX_H_ diff --git a/libs/libvpx/vpx_dsp/ppc/variance_vsx.c b/libs/libvpx/vpx_dsp/ppc/variance_vsx.c new file mode 100644 index 0000000000..1efe2f0056 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/variance_vsx.c @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
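types_vsx.h gives the rest of the PPC code NEON-style typedefs over the AltiVec/VSX vector types from <altivec.h>, an xxpermdi() wrapper that falls back to vec_perm() permute tables on clang, and the unpack_to_* helpers. The unpack macros widen one half of a 16-byte vector of unsigned bytes to eight 16-bit lanes by interleaving with a zero vector; on little-endian targets the merge operands are swapped so the zero byte still ends up in the high half of each lane. A scalar picture of one such widen (an illustrative sketch only):

    #include <stdint.h>

    /* What unpack_to_s16_h()/unpack_to_s16_l() compute for one half of a
     * vector: zero-extend eight unsigned bytes to signed 16-bit lanes. */
    static void widen_u8_to_s16(const uint8_t *half8, int16_t out[8]) {
      int i;
      for (i = 0; i < 8; i++) out[i] = (int16_t)half8[i]; /* always >= 0 */
    }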
+ */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +static inline uint8x16_t read4x2(const uint8_t *a, int stride) { + const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a); + const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride); + + return (uint8x16_t)vec_mergeh(a0, a1); +} + +uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride) { + int distortion; + + const int16x8_t a0 = unpack_to_s16_h(read4x2(a, a_stride)); + const int16x8_t a1 = unpack_to_s16_h(read4x2(a + a_stride * 2, a_stride)); + const int16x8_t b0 = unpack_to_s16_h(read4x2(b, b_stride)); + const int16x8_t b1 = unpack_to_s16_h(read4x2(b + b_stride * 2, b_stride)); + const int16x8_t d0 = vec_sub(a0, b0); + const int16x8_t d1 = vec_sub(a1, b1); + const int32x4_t ds = vec_msum(d1, d1, vec_msum(d0, d0, vec_splat_s32(0))); + const int32x4_t d = vec_splat(vec_sums(ds, vec_splat_s32(0)), 3); + + vec_ste(d, 0, &distortion); + + return distortion; +} + +// TODO(lu_zero): Unroll +uint32_t vpx_get_mb_ss_vsx(const int16_t *a) { + unsigned int i, sum = 0; + int32x4_t s = vec_splat_s32(0); + + for (i = 0; i < 256; i += 8) { + const int16x8_t v = vec_vsx_ld(0, a + i); + s = vec_msum(v, v, s); + } + + s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3); + + vec_ste((uint32x4_t)s, 0, &sum); + + return sum; +} + +void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int i, j; + /* comp_pred and pred must be 16 byte aligned. */ + assert(((intptr_t)comp_pred & 0xf) == 0); + assert(((intptr_t)pred & 0xf) == 0); + if (width >= 16) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 16) { + const uint8x16_t v = vec_avg(vec_vsx_ld(j, pred), vec_vsx_ld(j, ref)); + vec_vsx_st(v, j, comp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else if (width == 8) { + // Process 2 lines at time + for (i = 0; i < height / 2; ++i) { + const uint8x16_t r0 = vec_vsx_ld(0, ref); + const uint8x16_t r1 = vec_vsx_ld(0, ref + ref_stride); + const uint8x16_t r = xxpermdi(r0, r1, 0); + const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r); + vec_vsx_st(v, 0, comp_pred); + comp_pred += 16; // width * 2; + pred += 16; // width * 2; + ref += ref_stride * 2; + } + } else { + assert(width == 4); + // process 4 lines at time + for (i = 0; i < height / 4; ++i) { + const uint32x4_t r0 = (uint32x4_t)vec_vsx_ld(0, ref); + const uint32x4_t r1 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride); + const uint32x4_t r2 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 2); + const uint32x4_t r3 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 3); + const uint8x16_t r = + (uint8x16_t)xxpermdi(vec_mergeh(r0, r1), vec_mergeh(r2, r3), 0); + const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r); + vec_vsx_st(v, 0, comp_pred); + comp_pred += 16; // width * 4; + pred += 16; // width * 4; + ref += ref_stride * 4; + } + } +} diff --git a/libs/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c b/libs/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c new file mode 100644 index 0000000000..5c3ba4576f --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
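vec_avg() rounds, so the averaging code in variance_vsx.c matches the existing C reference vpx_comp_avg_pred_c(): every output pixel is (pred + ref + 1) >> 1, with the width-8 and width-4 paths packing two or four rows into a single 16-byte store (hence the += 16 pointer bumps). A minimal scalar equivalent for comparison (editorial sketch, not code from the tree):

    #include <stdint.h>

    /* Scalar sketch of the per-byte rounding average done by vec_avg(). */
    static void comp_avg_pred_ref(uint8_t *comp_pred, const uint8_t *pred,
                                  int width, int height, const uint8_t *ref,
                                  int ref_stride) {
      int i, j;
      for (i = 0; i < height; ++i) {
        for (j = 0; j < width; ++j)
          comp_pred[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1);
        comp_pred += width;
        pred += width;
        ref += ref_stride;
      }
    }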
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_dsp/ppc/types_vsx.h" + +// TODO(lu_zero): unroll +static inline void copy_w16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { + int i; + + for (i = h; i--;) { + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + src += src_stride; + dst += dst_stride; + } +} + +static inline void copy_w32(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { + int i; + + for (i = h; i--;) { + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + vec_vsx_st(vec_vsx_ld(16, src), 16, dst); + src += src_stride; + dst += dst_stride; + } +} + +static inline void copy_w64(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { + int i; + + for (i = h; i--;) { + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + vec_vsx_st(vec_vsx_ld(16, src), 16, dst); + vec_vsx_st(vec_vsx_ld(32, src), 32, dst); + vec_vsx_st(vec_vsx_ld(48, src), 48, dst); + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + switch (w) { + case 16: { + copy_w16(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + copy_w32(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + copy_w64(src, src_stride, dst, dst_stride, h); + break; + } + default: { + int i; + for (i = h; i--;) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } + break; + } + } +} + +static inline void avg_w16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { + int i; + + for (i = h; i--;) { + const uint8x16_t v = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst)); + vec_vsx_st(v, 0, dst); + src += src_stride; + dst += dst_stride; + } +} + +static inline void avg_w32(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { + int i; + + for (i = h; i--;) { + const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst)); + const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst)); + vec_vsx_st(v0, 0, dst); + vec_vsx_st(v1, 16, dst); + src += src_stride; + dst += dst_stride; + } +} + +static inline void avg_w64(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { + int i; + + for (i = h; i--;) { + const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst)); + const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst)); + const uint8x16_t v2 = vec_avg(vec_vsx_ld(32, src), vec_vsx_ld(32, dst)); + const uint8x16_t v3 = vec_avg(vec_vsx_ld(48, src), vec_vsx_ld(48, dst)); + vec_vsx_st(v0, 0, dst); + vec_vsx_st(v1, 16, dst); + vec_vsx_st(v2, 32, dst); + vec_vsx_st(v3, 48, dst); + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { + switch (w) { + case 16: { + avg_w16(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + avg_w32(src, src_stride, dst, dst_stride, h); + break; + } 
+ case 64: { + avg_w64(src, src_stride, dst, dst_stride, h); + break; + } + default: { + vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} + +static inline void convolve_line(uint8_t *dst, const int16x8_t s, + const int16x8_t f) { + const int32x4_t sum = vec_msum(s, f, vec_splat_s32(0)); + const int32x4_t bias = + vec_sl(vec_splat_s32(1), vec_splat_u32(FILTER_BITS - 1)); + const int32x4_t avg = vec_sr(vec_sums(sum, bias), vec_splat_u32(FILTER_BITS)); + const uint8x16_t v = vec_splat( + vec_packsu(vec_pack(avg, vec_splat_s32(0)), vec_splat_s16(0)), 3); + vec_ste(v, 0, dst); +} + +static inline void convolve_line_h(uint8_t *dst, const uint8_t *const src_x, + const int16_t *const x_filter) { + const int16x8_t s = unpack_to_s16_h(vec_vsx_ld(0, src_x)); + const int16x8_t f = vec_vsx_ld(0, x_filter); + + convolve_line(dst, s, f); +} + +// TODO(lu_zero): Implement 8x8 and bigger block special cases +static inline void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + convolve_line_h(dst + x, &src[x_q4 >> SUBPEL_BITS], + x_filters[x_q4 & SUBPEL_MASK]); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static inline void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + uint8_t v; + convolve_line_h(&v, &src[x_q4 >> SUBPEL_BITS], + x_filters[x_q4 & SUBPEL_MASK]); + dst[x] = ROUND_POWER_OF_TWO(dst[x] + v, 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static uint8x16_t transpose_line_u8_8x8(uint8x16_t a, uint8x16_t b, + uint8x16_t c, uint8x16_t d, + uint8x16_t e, uint8x16_t f, + uint8x16_t g, uint8x16_t h) { + uint16x8_t ab = (uint16x8_t)vec_mergeh(a, b); + uint16x8_t cd = (uint16x8_t)vec_mergeh(c, d); + uint16x8_t ef = (uint16x8_t)vec_mergeh(e, f); + uint16x8_t gh = (uint16x8_t)vec_mergeh(g, h); + + uint32x4_t abcd = (uint32x4_t)vec_mergeh(ab, cd); + uint32x4_t efgh = (uint32x4_t)vec_mergeh(ef, gh); + + return (uint8x16_t)vec_mergeh(abcd, efgh); +} + +static inline void convolve_line_v(uint8_t *dst, const uint8_t *const src_y, + ptrdiff_t src_stride, + const int16_t *const y_filter) { + uint8x16_t s0 = vec_vsx_ld(0, src_y + 0 * src_stride); + uint8x16_t s1 = vec_vsx_ld(0, src_y + 1 * src_stride); + uint8x16_t s2 = vec_vsx_ld(0, src_y + 2 * src_stride); + uint8x16_t s3 = vec_vsx_ld(0, src_y + 3 * src_stride); + uint8x16_t s4 = vec_vsx_ld(0, src_y + 4 * src_stride); + uint8x16_t s5 = vec_vsx_ld(0, src_y + 5 * src_stride); + uint8x16_t s6 = vec_vsx_ld(0, src_y + 6 * src_stride); + uint8x16_t s7 = vec_vsx_ld(0, src_y + 7 * src_stride); + const int16x8_t f = vec_vsx_ld(0, y_filter); + uint8_t buf[16]; + const uint8x16_t s = transpose_line_u8_8x8(s0, s1, s2, s3, s4, s5, s6, s7); + + vec_vsx_st(s, 0, buf); + + convolve_line(dst, unpack_to_s16_h(s), f); +} + +static inline void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * 
(SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + convolve_line_v(dst + y * dst_stride, + &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride, + y_filters[y_q4 & SUBPEL_MASK]); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static inline void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + uint8_t v; + convolve_line_v(&v, &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride, + y_filters[y_q4 & SUBPEL_MASK]); + dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + v, 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static inline void convolve(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 
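To make the 135-row bound concrete (an editorial check, assuming SUBPEL_BITS == 4 and SUBPEL_TAPS == 8 from vpx_filter.h): the worst case allowed by the asserts below is h = 64, y_step_q4 = 32 and the largest sub-pel phase y0_q4 = 15, so the expression used for intermediate_height evaluates to ((63 * 32 + 15) >> 4) + 8 = 126 + 8 = 134; rounding the division up instead of truncating gives the 135 quoted above, so the 64 * 135 buffer always has a row to spare.

    #include <assert.h>

    /* Worst-case intermediate height for the two-pass filter, per the
     * derivation in the comment above (SUBPEL_BITS = 4, SUBPEL_TAPS = 8). */
    static int max_intermediate_height(void) {
      const int h = 64, y_step_q4 = 32, y0_q4 = 15;
      const int rows = (((h - 1) * y_step_q4 + y0_q4) >> 4) + 8;
      assert(rows <= 135);
      return rows; /* 134 */
    }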
+ DECLARE_ALIGNED(16, uint8_t, temp[64 * 135]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, + filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, + y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)y0_q4; + (void)y_step_q4; + + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, + h); +} + +void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)y0_q4; + (void)y_step_q4; + + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); +} + +void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)x0_q4; + (void)x_step_q4; + + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); +} + +void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)x0_q4; + (void)x_step_q4; + + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, + w, h); +} + +void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); +} + +void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + // Fixed size intermediate buffer places limits on parameters. + DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); + assert(w <= 64); + assert(h <= 64); + + vpx_convolve8_vsx(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); + vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); +} diff --git a/libs/libvpx/vpx_dsp/prob.h b/libs/libvpx/vpx_dsp/prob.h index 3127a00bb9..f1cc0eaa10 100644 --- a/libs/libvpx/vpx_dsp/prob.h +++ b/libs/libvpx/vpx_dsp/prob.h @@ -11,6 +11,8 @@ #ifndef VPX_DSP_PROB_H_ #define VPX_DSP_PROB_H_ +#include + #include "./vpx_config.h" #include "./vpx_dsp_common.h" @@ -43,17 +45,20 @@ typedef int8_t vpx_tree_index; typedef const vpx_tree_index vpx_tree[]; -static INLINE vpx_prob clip_prob(int p) { - return (p > 255) ? 255 : (p < 1) ? 1 : p; -} - static INLINE vpx_prob get_prob(unsigned int num, unsigned int den) { - if (den == 0) return 128u; - return clip_prob((int)(((int64_t)num * 256 + (den >> 1)) / den)); + assert(den != 0); + { + const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den); + // (p > 255) ? 255 : (p < 1) ? 
1 : p; + const int clipped_prob = p | ((255 - p) >> 23) | (p == 0); + return (vpx_prob)clipped_prob; + } } static INLINE vpx_prob get_binary_prob(unsigned int n0, unsigned int n1) { - return get_prob(n0, n0 + n1); + const unsigned int den = n0 + n1; + if (den == 0) return 128u; + return get_prob(n0, den); } /* This function assumes prob1 and prob2 are already within [1,255] range. */ diff --git a/libs/libvpx/vpx_dsp/quantize.c b/libs/libvpx/vpx_dsp/quantize.c index 3c7f9832f7..e37ca92ad4 100644 --- a/libs/libvpx/vpx_dsp/quantize.c +++ b/libs/libvpx/vpx_dsp/quantize.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include + #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/quantize.h" #include "vpx_mem/vpx_mem.h" @@ -123,40 +125,40 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; - if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) - non_zero_count--; - else - break; - } + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else + break; + } - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. 
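A short aside on the branchless clamp introduced in get_prob() at the top of this hunk: it leans on the final cast to the 8-bit vpx_prob type and on signed right shifts being arithmetic, which holds for the compilers libvpx targets. (255 - p) >> 23 is all-ones whenever p exceeds 255 and zero for 0 <= p <= 255, so the OR plus the cast saturates oversized values to 255, while the (p == 0) term lifts zero up to 1; everything in [1, 255] passes through untouched, matching the ternary spelled out in the comment. As a stand-alone helper (illustrative equivalent, not code from the tree):

    /* Branchless clamp of p to [1, 255], mirroring the expression in get_prob(). */
    static unsigned char clip_prob_branchless(int p) {
      return (unsigned char)(p | ((255 - p) >> 23) | (p == 0));
    }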
+ for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - if (abs_coeff >= zbins[rc != 0]) { - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * - quant_shift_ptr[rc != 0]) >> - 16; // quantization - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + if (abs_coeff >= zbins[rc != 0]) { + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + 16; // quantization + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (tmp) eob = i; - } + if (tmp) eob = i; } } *eob_ptr = eob + 1; @@ -174,39 +176,38 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; - if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) - non_zero_count--; - else - break; - } + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else + break; + } - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. 
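The quantization step used in these loops, q = ((((x * quant) >> 16) + x) * quant_shift) >> 16 with x = abs_coeff + round (clamped to 16-bit range), is a two-stage Q16 multiply: up to truncation it behaves like a single multiplication by (quant + 65536) * quant_shift / 2^32. A small trace with hypothetical operand values (not taken from the codec's quantization tables) shows the scaling:

    #include <stdint.h>

    /* The two-stage Q16 multiply from vpx_quantize_b_c(), isolated.
     * Hypothetical example: x = 1000, quant = 20000, quant_shift = 32767
     *   stage 1: ((1000 * 20000) >> 16) + 1000 = 305 + 1000 = 1305
     *   stage 2: (1305 * 32767) >> 16          = 652
     */
    static int quantize_step(int16_t x, int16_t quant, int16_t quant_shift) {
      const int tmp = ((x * quant) >> 16) + x;
      return (tmp * quant_shift) >> 16;
    }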
+ for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - if (abs_coeff >= zbins[rc != 0]) { - const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (abs_qcoeff) eob = i; - } + if (abs_coeff >= zbins[rc != 0]) { + const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + if (abs_qcoeff) eob = i; } } *eob_ptr = eob + 1; @@ -228,41 +229,40 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int idx_arr[1024]; int i, eob = -1; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) - idx_arr[idx++] = i; - } + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; + } - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - int tmp; - int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); - tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * - quant_shift_ptr[rc != 0]) >> - 15; + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. 
+ for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int tmp; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * + quant_shift_ptr[rc != 0]) >> + 15; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (tmp) eob = idx_arr[i]; - } + if (tmp) eob = idx_arr[i]; } *eob_ptr = eob + 1; } @@ -282,38 +282,35 @@ void vpx_highbd_quantize_b_32x32_c( int idx_arr[1024]; int i, eob = -1; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) - idx_arr[idx++] = i; - } + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; + } - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = idx_arr[i]; - } + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. 
+ for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = idx_arr[i]; } *eob_ptr = eob + 1; } diff --git a/libs/libvpx/vpx_dsp/sad.c b/libs/libvpx/vpx_dsp/sad.c index c80ef729bf..18b6dc6e09 100644 --- a/libs/libvpx/vpx_dsp/sad.c +++ b/libs/libvpx/vpx_dsp/sad.c @@ -39,7 +39,7 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ - uint8_t comp_pred[m * n]; \ + DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \ vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ return sad(src, src_stride, comp_pred, m, m, n); \ } @@ -70,8 +70,6 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, /* clang-format off */ // 64x64 sadMxN(64, 64) -sadMxNxK(64, 64, 3) -sadMxNxK(64, 64, 8) sadMxNx4D(64, 64) // 64x32 @@ -84,8 +82,6 @@ sadMxNx4D(32, 64) // 32x32 sadMxN(32, 32) -sadMxNxK(32, 32, 3) -sadMxNxK(32, 32, 8) sadMxNx4D(32, 32) // 32x16 @@ -122,12 +118,10 @@ sadMxNx4D(8, 8) // 8x4 sadMxN(8, 4) -sadMxNxK(8, 4, 8) sadMxNx4D(8, 4) // 4x8 sadMxN(4, 8) -sadMxNxK(4, 8, 8) sadMxNx4D(4, 8) // 4x4 @@ -178,22 +172,11 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, unsigned int vpx_highbd_sad##m##x##n##_avg_c( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ - uint16_t comp_pred[m * n]; \ + DECLARE_ALIGNED(16, uint16_t, comp_pred[m * n]); \ vpx_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ } -#define highbd_sadMxNxK(m, n, k) \ - void vpx_highbd_sad##m##x##n##x##k##_c( \ - const uint8_t *src, int src_stride, const uint8_t *ref_array, \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < k; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, \ - &ref_array[i], ref_stride); \ - } \ - } - #define highbd_sadMxNx4D(m, n) \ void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[], \ @@ -208,8 +191,6 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, /* clang-format off */ // 64x64 highbd_sadMxN(64, 64) -highbd_sadMxNxK(64, 64, 3) -highbd_sadMxNxK(64, 64, 8) highbd_sadMxNx4D(64, 64) // 64x32 @@ -222,8 +203,6 @@ highbd_sadMxNx4D(32, 64) // 32x32 highbd_sadMxN(32, 32) -highbd_sadMxNxK(32, 32, 3) -highbd_sadMxNxK(32, 32, 8) highbd_sadMxNx4D(32, 32) // 32x16 @@ -236,42 +215,30 @@ highbd_sadMxNx4D(16, 32) // 16x16 highbd_sadMxN(16, 16) -highbd_sadMxNxK(16, 16, 3) -highbd_sadMxNxK(16, 16, 8) highbd_sadMxNx4D(16, 16) // 16x8 highbd_sadMxN(16, 8) -highbd_sadMxNxK(16, 8, 3) -highbd_sadMxNxK(16, 8, 8) highbd_sadMxNx4D(16, 8) // 8x16 highbd_sadMxN(8, 16) -highbd_sadMxNxK(8, 16, 3) -highbd_sadMxNxK(8, 16, 8) highbd_sadMxNx4D(8, 16) // 8x8 highbd_sadMxN(8, 8) -highbd_sadMxNxK(8, 8, 3) -highbd_sadMxNxK(8, 8, 8) highbd_sadMxNx4D(8, 
8) // 8x4 highbd_sadMxN(8, 4) -highbd_sadMxNxK(8, 4, 8) highbd_sadMxNx4D(8, 4) // 4x8 highbd_sadMxN(4, 8) -highbd_sadMxNxK(4, 8, 8) highbd_sadMxNx4D(4, 8) // 4x4 highbd_sadMxN(4, 4) -highbd_sadMxNxK(4, 4, 3) -highbd_sadMxNxK(4, 4, 8) highbd_sadMxNx4D(4, 4) /* clang-format on */ diff --git a/libs/libvpx/vpx_dsp/skin_detection.c b/libs/libvpx/vpx_dsp/skin_detection.c new file mode 100644 index 0000000000..bbbb6c3a17 --- /dev/null +++ b/libs/libvpx/vpx_dsp/skin_detection.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/skin_detection.h" + +#define MODEL_MODE 1 + +// Fixed-point skin color model parameters. +static const int skin_mean[5][2] = { { 7463, 9614 }, + { 6400, 10240 }, + { 7040, 10240 }, + { 8320, 9280 }, + { 6800, 9614 } }; +static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16 +static const int skin_threshold[6] = { 1570636, 1400000, 800000, + 800000, 800000, 800000 }; // q18 +// Thresholds on luminance. +static const int y_low = 40; +static const int y_high = 220; + +// Evaluates the Mahalanobis distance measure for the input CbCr values. +static int vpx_evaluate_skin_color_difference(const int cb, const int cr, + const int idx) { + const int cb_q6 = cb << 6; + const int cr_q6 = cr << 6; + const int cb_diff_q12 = + (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]); + const int cbcr_diff_q12 = + (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]); + const int cr_diff_q12 = + (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]); + const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10; + const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10; + const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10; + const int skin_diff = + skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 + + skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2; + return skin_diff; +} + +// Checks if the input yCbCr values corresponds to skin color. +int vpx_skin_pixel(const int y, const int cb, const int cr, int motion) { + if (y < y_low || y > y_high) { + return 0; + } else if (MODEL_MODE == 0) { + return (vpx_evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]); + } else { + int i = 0; + // Exit on grey. + if (cb == 128 && cr == 128) return 0; + // Exit on very strong cb. + if (cb > 150 && cr < 110) return 0; + for (; i < 5; ++i) { + int skin_color_diff = vpx_evaluate_skin_color_difference(cb, cr, i); + if (skin_color_diff < skin_threshold[i + 1]) { + if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) { + return 0; + } else if (motion == 0 && + skin_color_diff > (skin_threshold[i + 1] >> 1)) { + return 0; + } else { + return 1; + } + } + // Exit if difference is much large than the threshold. + if (skin_color_diff > (skin_threshold[i + 1] << 3)) { + return 0; + } + } + return 0; + } +} diff --git a/libs/libvpx/vpx_dsp/skin_detection.h b/libs/libvpx/vpx_dsp/skin_detection.h new file mode 100644 index 0000000000..a2e99baf7e --- /dev/null +++ b/libs/libvpx/vpx_dsp/skin_detection.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
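The skin classifier above is pure integer math, and its fixed-point bookkeeping lines up with the q16/q18 notes on the constants: Cb and Cr enter in Q6, the squared differences are Q12, the (+ (1 << 9)) >> 10 step rounds them to Q2, and weighting by the Q16 inverse-covariance terms yields a Q18 Mahalanobis distance that is compared directly against the Q18 thresholds. A hedged usage sketch of the exported entry point (the sample values are made up for illustration):

    #include <stdio.h>

    #include "vpx_dsp/skin_detection.h"

    int main(void) {
      /* Hypothetical luma/chroma sample; motion = 1 skips the tighter
       * threshold that static (motion == 0) pixels must also pass. */
      const int y = 100, cb = 120, cr = 150, motion = 1;
      printf("skin: %d\n", vpx_skin_pixel(y, cb, cr, motion));
      return 0;
    }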
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_SKIN_DETECTION_H_ +#define VPX_DSP_SKIN_DETECTION_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +int vpx_skin_pixel(const int y, const int cb, const int cr, int motion); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_SKIN_DETECTION_H_ diff --git a/libs/libvpx/vpx_dsp/txfm_common.h b/libs/libvpx/vpx_dsp/txfm_common.h index fd27f928ed..d01d7085a2 100644 --- a/libs/libvpx/vpx_dsp/txfm_common.h +++ b/libs/libvpx/vpx_dsp/txfm_common.h @@ -25,42 +25,42 @@ // printf("static const int cospi_%d_64 = %.0f;\n", i, // round(16384 * cos(i*M_PI/64))); // Note: sin(k*Pi/64) = cos((32-k)*Pi/64) -static const tran_high_t cospi_1_64 = 16364; -static const tran_high_t cospi_2_64 = 16305; -static const tran_high_t cospi_3_64 = 16207; -static const tran_high_t cospi_4_64 = 16069; -static const tran_high_t cospi_5_64 = 15893; -static const tran_high_t cospi_6_64 = 15679; -static const tran_high_t cospi_7_64 = 15426; -static const tran_high_t cospi_8_64 = 15137; -static const tran_high_t cospi_9_64 = 14811; -static const tran_high_t cospi_10_64 = 14449; -static const tran_high_t cospi_11_64 = 14053; -static const tran_high_t cospi_12_64 = 13623; -static const tran_high_t cospi_13_64 = 13160; -static const tran_high_t cospi_14_64 = 12665; -static const tran_high_t cospi_15_64 = 12140; -static const tran_high_t cospi_16_64 = 11585; -static const tran_high_t cospi_17_64 = 11003; -static const tran_high_t cospi_18_64 = 10394; -static const tran_high_t cospi_19_64 = 9760; -static const tran_high_t cospi_20_64 = 9102; -static const tran_high_t cospi_21_64 = 8423; -static const tran_high_t cospi_22_64 = 7723; -static const tran_high_t cospi_23_64 = 7005; -static const tran_high_t cospi_24_64 = 6270; -static const tran_high_t cospi_25_64 = 5520; -static const tran_high_t cospi_26_64 = 4756; -static const tran_high_t cospi_27_64 = 3981; -static const tran_high_t cospi_28_64 = 3196; -static const tran_high_t cospi_29_64 = 2404; -static const tran_high_t cospi_30_64 = 1606; -static const tran_high_t cospi_31_64 = 804; +static const tran_coef_t cospi_1_64 = 16364; +static const tran_coef_t cospi_2_64 = 16305; +static const tran_coef_t cospi_3_64 = 16207; +static const tran_coef_t cospi_4_64 = 16069; +static const tran_coef_t cospi_5_64 = 15893; +static const tran_coef_t cospi_6_64 = 15679; +static const tran_coef_t cospi_7_64 = 15426; +static const tran_coef_t cospi_8_64 = 15137; +static const tran_coef_t cospi_9_64 = 14811; +static const tran_coef_t cospi_10_64 = 14449; +static const tran_coef_t cospi_11_64 = 14053; +static const tran_coef_t cospi_12_64 = 13623; +static const tran_coef_t cospi_13_64 = 13160; +static const tran_coef_t cospi_14_64 = 12665; +static const tran_coef_t cospi_15_64 = 12140; +static const tran_coef_t cospi_16_64 = 11585; +static const tran_coef_t cospi_17_64 = 11003; +static const tran_coef_t cospi_18_64 = 10394; +static const tran_coef_t cospi_19_64 = 9760; +static const tran_coef_t cospi_20_64 = 9102; +static const tran_coef_t cospi_21_64 = 8423; +static const tran_coef_t cospi_22_64 = 7723; +static const tran_coef_t cospi_23_64 = 7005; +static const tran_coef_t cospi_24_64 = 6270; +static const tran_coef_t cospi_25_64 = 
5520; +static const tran_coef_t cospi_26_64 = 4756; +static const tran_coef_t cospi_27_64 = 3981; +static const tran_coef_t cospi_28_64 = 3196; +static const tran_coef_t cospi_29_64 = 2404; +static const tran_coef_t cospi_30_64 = 1606; +static const tran_coef_t cospi_31_64 = 804; // 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 -static const tran_high_t sinpi_1_9 = 5283; -static const tran_high_t sinpi_2_9 = 9929; -static const tran_high_t sinpi_3_9 = 13377; -static const tran_high_t sinpi_4_9 = 15212; +static const tran_coef_t sinpi_1_9 = 5283; +static const tran_coef_t sinpi_2_9 = 9929; +static const tran_coef_t sinpi_3_9 = 13377; +static const tran_coef_t sinpi_4_9 = 15212; #endif // VPX_DSP_TXFM_COMMON_H_ diff --git a/libs/libvpx/vpx_dsp/variance.c b/libs/libvpx/vpx_dsp/variance.c index d1fa0560d5..93bd8f30de 100644 --- a/libs/libvpx/vpx_dsp/variance.c +++ b/libs/libvpx/vpx_dsp/variance.c @@ -164,7 +164,7 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters[yoffset]); \ \ - vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ + vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ \ return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ } @@ -294,7 +294,7 @@ static void highbd_12_variance(const uint8_t *a8, int a_stride, uint32_t *sse) { \ int sum; \ highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ } \ \ uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ @@ -463,8 +463,8 @@ static void highbd_var_filter_block2d_bil_second_pass( highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters[yoffset]); \ \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ \ return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ dst, dst_stride, sse); \ @@ -483,8 +483,8 @@ static void highbd_var_filter_block2d_bil_second_pass( highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters[yoffset]); \ \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ \ return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ dst, dst_stride, sse); \ @@ -503,8 +503,8 @@ static void highbd_var_filter_block2d_bil_second_pass( highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters[yoffset]); \ \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ \ return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ dst, dst_stride, sse); \ diff --git a/libs/libvpx/vpx_dsp/variance.h b/libs/libvpx/vpx_dsp/variance.h index 4c482551e0..100573299b 100644 --- a/libs/libvpx/vpx_dsp/variance.h +++ b/libs/libvpx/vpx_dsp/variance.h @@ -74,8 +74,6 @@ typedef struct vp9_variance_vtable { vpx_variance_fn_t vf; vpx_subpixvariance_fn_t svf; vpx_subp_avg_variance_fn_t svaf; - vpx_sad_multi_fn_t sdx3f; - vpx_sad_multi_fn_t sdx8f; vpx_sad_multi_d_fn_t sdx4df; } vp9_variance_fn_ptr_t; #endif // CONFIG_VP9 diff --git a/libs/libvpx/vpx_dsp/vpx_convolve.c 
b/libs/libvpx/vpx_dsp/vpx_convolve.c index f17281b369..e55a963f9d 100644 --- a/libs/libvpx/vpx_dsp/vpx_convolve.c +++ b/libs/libvpx/vpx_dsp/vpx_convolve.c @@ -25,6 +25,7 @@ static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int w, int h) { int x, y; src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { int x_q4 = x0_q4; for (x = 0; x < w; ++x) { @@ -46,6 +47,7 @@ static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int w, int h) { int x, y; src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { int x_q4 = x0_q4; for (x = 0; x < w; ++x) { @@ -72,7 +74,7 @@ static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, for (x = 0; x < w; ++x) { int y_q4 = y0_q4; for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; int k, sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) @@ -95,7 +97,7 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, for (x = 0; x < w; ++x) { int y_q4 = y0_q4; for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; int k, sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) @@ -111,11 +113,52 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const InterpKernel *const x_filters, - int x0_q4, int x_step_q4, - const InterpKernel *const y_filters, int y0_q4, - int y_step_q4, int w, int h) { +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)y0_q4; + (void)y_step_q4; + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, + h); +} + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)y0_q4; + (void)y_step_q4; + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); +} + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)x0_q4; + (void)x_step_q4; + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); +} + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)x0_q4; + (void)x_step_q4; + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, + w, h); +} + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. 
@@ -128,129 +171,49 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, // --Must round-up because block may be located at sub-pixel position. // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. - uint8_t temp[135 * 64]; - int intermediate_height = + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. + uint8_t temp[64 * 135]; + const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= 64); assert(h <= 64); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, - x_filters, x0_q4, x_step_q4, w, intermediate_height); - convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, - y_filters, y0_q4, y_step_q4, w, h); -} - -static const InterpKernel *get_filter_base(const int16_t *filter) { - // NOTE: This assumes that the filter table is 256-byte aligned. - // TODO(agrange) Modify to make independent of table alignment. - return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); -} - -static int get_filter_offset(const int16_t *f, const InterpKernel *base) { - return (int)((const InterpKernel *)(intptr_t)f - base); -} - -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; - (void)y_step_q4; - - convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - w, h); -} - -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; - (void)y_step_q4; - - convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, w, h); -} - -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; - (void)x_step_q4; - - convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, - w, h); -} - -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; - (void)x_step_q4; - - convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, - y_step_q4, w, h); -} - -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int 
y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h); + filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, + y0_q4, y_step_q4, w, h); } void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - /* Fixed size intermediate buffer places limits on parameters. */ + // Fixed size intermediate buffer places limits on parameters. DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); assert(w <= 64); assert(h <= 64); - vpx_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y, + vpx_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); - vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); + vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); } void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int filter_x_stride, const int16_t *filter_y, - int filter_y_stride, int w, int h) { + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { int r; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; for (r = h; r > 0; --r) { memcpy(dst, src, w); @@ -260,83 +223,81 @@ void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, } void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int filter_x_stride, const int16_t *filter_y, - int filter_y_stride, int w, int h) { + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { int x, y; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; for (y = 0; y < h; ++y) { for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); - src += src_stride; dst += dst_stride; } } void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int 
y_step_q4, int w, int h) { - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); } void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } #if CONFIG_VP9_HIGHBITDEPTH -static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_horiz(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { int x_q4 = x0_q4; for (x = 0; x < w; ++x) { @@ -352,14 +313,13 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_avg_horiz(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { int x_q4 = x0_q4; for (x = 0; x < w; ++x) { @@ -377,14 
+337,13 @@ static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_vert(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { int y_q4 = y0_q4; for (y = 0; y < h; ++y) { @@ -402,14 +361,13 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_avg_vert(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { int y_q4 = y0_q4; for (y = 0; y < h; ++y) { @@ -429,11 +387,11 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h, int bd) { +static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. @@ -447,7 +405,7 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 
uint16_t temp[64 * 135]; - int intermediate_height = + const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= 64); @@ -456,112 +414,97 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 <= 32); highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, - CONVERT_TO_BYTEPTR(temp), 64, x_filters, x0_q4, - x_step_q4, w, intermediate_height, bd); - highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1), - 64, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, - bd); + temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height, bd); + highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + filter, y0_q4, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - (void)filter_y; +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)y0_q4; (void)y_step_q4; - highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + highbd_convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, h, bd); } -void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - (void)filter_y; + (void)y0_q4; (void)y_step_q4; - highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, h, bd); } -void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + highbd_convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int 
y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h, bd); + highbd_convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { // Fixed size intermediate buffer places limits on parameters. 
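These hunks change the convolve interface: a single InterpKernel table plus explicit x0_q4/y0_q4 phases replaces the two raw int16_t filter pointers, and the high bit depth entry points now take uint16_t buffers directly instead of CONVERT_TO_*-wrapped uint8_t pointers. A minimal caller sketch under the new convention (the kernel table and phases are placeholders; any table of sixteen 8-tap kernels with phases in 0..15 would do):

static void highbd_interp_block_sketch(const uint16_t *src, ptrdiff_t src_stride,
                                       uint16_t *dst, ptrdiff_t dst_stride,
                                       const InterpKernel *kernels, /* 16 x 8-tap table */
                                       int subpel_x, int subpel_y,  /* phases, 0..15 */
                                       int w, int h, int bd) {
  /* Unscaled prediction: both steps stay at 16 (one full pel per output). */
  vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, kernels, subpel_x,
                         16, subpel_y, 16, w, h, bd);
}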
DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]); assert(w <= 64); assert(h <= 64); - vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); - vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride, NULL, - 0, NULL, 0, w, h, bd); + vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h, bd); + vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h, + bd); } -void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h, int bd) { +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { int r; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; for (r = h; r > 0; --r) { @@ -571,24 +514,22 @@ void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, } } -void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h, int bd) { +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); - } + for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); src += src_stride; dst += dst_stride; } diff --git a/libs/libvpx/vpx_dsp/vpx_convolve.h b/libs/libvpx/vpx_dsp/vpx_convolve.h index ee9744b3ae..7979268a95 100644 --- a/libs/libvpx/vpx_dsp/vpx_convolve.h +++ b/libs/libvpx/vpx_dsp/vpx_convolve.h @@ -19,15 +19,15 @@ extern "C" { typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #if CONFIG_VP9_HIGHBITDEPTH -typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +typedef void (*highbd_convolve_fn_t)(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd); #endif diff --git a/libs/libvpx/vpx_dsp/vpx_dsp.mk b/libs/libvpx/vpx_dsp/vpx_dsp.mk index a78041ce7b..3b1a873cd2 100644 --- a/libs/libvpx/vpx_dsp/vpx_dsp.mk +++ b/libs/libvpx/vpx_dsp/vpx_dsp.mk @@ -13,6 +13,13 @@ 
DSP_SRCS-yes += vpx_dsp_common.h DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h +DSP_SRCS-$(HAVE_AVX2) += x86/bitdepth_conversion_avx2.h +DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2.h +# This file is included in libs.mk. Including it here would cause it to be +# compiled into an object. Even as an empty file, this would create an +# executable section on the stack. +#DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2$(ASM) + # bit reader DSP_SRCS-yes += prob.h DSP_SRCS-yes += prob.c @@ -43,11 +50,14 @@ DSP_SRCS-yes += intrapred.c DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm -DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm +DSP_SRCS-$(HAVE_VSX) += ppc/intrapred_vsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_intrin_sse2.c +DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_intrin_ssse3.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_intrapred_neon.c endif # CONFIG_VP9_HIGHBITDEPTH ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),) @@ -56,6 +66,7 @@ DSP_SRCS-yes += deblock.c DSP_SRCS-yes += postproc.h DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c +DSP_SRCS-$(HAVE_NEON) += arm/deblock_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm endif # CONFIG_POSTPROC @@ -77,6 +88,8 @@ DSP_SRCS-yes += vpx_filter.h DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c +DSP_SRCS-$(HAVE_SSSE3) += x86/convolve_ssse3.h +DSP_SRCS-$(HAVE_AVX2) += x86/convolve_avx2.h DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_8t_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_bilinear_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm @@ -86,9 +99,15 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_8t_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_bilinear_sse2.asm +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_copy_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_avg_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve8_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_neon.c endif DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/vpx_scaled_convolve8_neon.c ifeq ($(HAVE_NEON_ASM),yes) DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM) @@ -99,7 +118,6 @@ DSP_SRCS-yes += arm/vpx_convolve_neon.c else ifeq ($(HAVE_NEON),yes) DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c -DSP_SRCS-yes += arm/vpx_convolve8_avg_neon.c DSP_SRCS-yes += arm/vpx_convolve8_neon.c DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c DSP_SRCS-yes += arm/vpx_convolve_neon.c @@ -130,6 +148,8 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_horiz_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_vert_dspr2.c +DSP_SRCS-$(HAVE_VSX) += ppc/vpx_convolve_vsx.c + # loop filters DSP_SRCS-yes += loopfilter.c @@ -137,15 +157,11 @@ DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c ifeq ($(HAVE_NEON_ASM),yes) -DSP_SRCS-yes += arm/loopfilter_neon.c -DSP_SRCS-yes += arm/loopfilter_mb_neon$(ASM) DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM) 
DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM) DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM) else -ifeq ($(HAVE_NEON),yes) -DSP_SRCS-yes += arm/loopfilter_mb_neon.c -endif # HAVE_NEON +DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c endif # HAVE_NEON_ASM DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_msa.h @@ -161,6 +177,7 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_NEON) += arm/highbd_loopfilter_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c endif # CONFIG_VP9_HIGHBITDEPTH @@ -180,6 +197,10 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm endif DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h +DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct16x16_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct32x32_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c @@ -193,48 +214,64 @@ DSP_SRCS-yes += inv_txfm.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm -ifeq ($(ARCH_X86_64),yes) -DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3_x86_64.asm -endif # ARCH_X86_64 +DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.h +DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c -ifeq ($(HAVE_NEON_ASM),yes) -DSP_SRCS-yes += arm/save_reg_neon$(ASM) -DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) -DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM) -DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM) -DSP_SRCS-yes += arm/idct32x32_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct32x32_add_neon$(ASM) -else -ifeq ($(HAVE_NEON),yes) -DSP_SRCS-yes += arm/idct4x4_1_add_neon.c -DSP_SRCS-yes += arm/idct4x4_add_neon.c -DSP_SRCS-yes += arm/idct8x8_1_add_neon.c -DSP_SRCS-yes += arm/idct8x8_add_neon.c -DSP_SRCS-yes += arm/idct16x16_1_add_neon.c -DSP_SRCS-yes += arm/idct16x16_add_neon.c -DSP_SRCS-yes += arm/idct32x32_1_add_neon.c -DSP_SRCS-yes += arm/idct32x32_add_neon.c -endif # HAVE_NEON -endif # HAVE_NEON_ASM -DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c +DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM) +DSP_SRCS-$(HAVE_VSX) += ppc/inv_txfm_vsx.c + +ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c DSP_SRCS-$(HAVE_MSA) += mips/idct8x8_msa.c DSP_SRCS-$(HAVE_MSA) += mips/idct16x16_msa.c DSP_SRCS-$(HAVE_MSA) += mips/idct32x32_msa.c -ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_DSPR2) += mips/inv_txfm_dspr2.h DSP_SRCS-$(HAVE_DSPR2) += mips/itrans4_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c -endif # CONFIG_VP9_HIGHBITDEPTH +else # CONFIG_VP9_HIGHBITDEPTH +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct4x4_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct8x8_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct16x16_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_34_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_135_add_neon.c +DSP_SRCS-$(HAVE_NEON) += 
arm/highbd_idct32x32_1024_add_neon.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_inv_txfm_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct4x4_add_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct16x16_add_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct32x32_add_sse2.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_inv_txfm_sse4.h +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct4x4_add_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct8x8_add_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct16x16_add_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct32x32_add_sse4.c +endif # !CONFIG_VP9_HIGHBITDEPTH + +ifeq ($(HAVE_NEON_ASM),yes) +DSP_SRCS-yes += arm/idct_neon$(ASM) +DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) +else +DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c +endif # HAVE_NEON_ASM +DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c + endif # CONFIG_VP9 # quantization @@ -242,32 +279,39 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) DSP_SRCS-yes += quantize.c DSP_SRCS-yes += quantize.h +DSP_SRCS-$(HAVE_SSE2) += x86/quantize_x86.h DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c +DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c +DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c +DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c endif -ifeq ($(ARCH_X86_64),yes) -DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm -DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx_x86_64.asm -endif # avg DSP_SRCS-yes += avg.c DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/avg_intrin_avx2.c DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c -DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c +DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c ifeq ($(ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm endif +DSP_SRCS-$(HAVE_VSX) += ppc/hadamard_vsx.c endif # CONFIG_VP9_ENCODER +# skin detection +DSP_SRCS-yes += skin_detection.h +DSP_SRCS-yes += skin_detection.c + ifeq ($(CONFIG_ENCODERS),yes) DSP_SRCS-yes += sad.c DSP_SRCS-yes += subtract.c DSP_SRCS-yes += sum_squares.c DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c +DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c @@ -276,11 +320,15 @@ DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c +DSP_SRCS-$(HAVE_MMI) += mips/sad_mmi.c +DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c + DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm DSP_SRCS-$(HAVE_SSSE3) += x86/sad_ssse3.asm DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c +DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c DSP_SRCS-$(HAVE_SSE) += x86/sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE) += x86/sad_sse2.asm @@ -288,6 +336,8 @@ DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm DSP_SRCS-$(HAVE_SSE2) += 
x86/subtract_sse2.asm +DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c + ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm @@ -299,16 +349,20 @@ ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC) DSP_SRCS-yes += variance.c DSP_SRCS-yes += variance.h +DSP_SRCS-$(HAVE_NEON) += arm/avg_pred_neon.c DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c +DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c + DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c -DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c +DSP_SRCS-$(HAVE_VSX) += ppc/variance_vsx.c ifeq ($(ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSE2) += x86/ssim_opt_x86_64.asm @@ -325,7 +379,19 @@ endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC # Neon utilities +DSP_SRCS-$(HAVE_NEON) += arm/mem_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/sum_neon.h DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/vpx_convolve8_neon.h + +# PPC VSX utilities +DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h +DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h +DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h + +# X86 utilities +DSP_SRCS-$(HAVE_SSE2) += x86/mem_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes) diff --git a/libs/libvpx/vpx_dsp/vpx_dsp_common.h b/libs/libvpx/vpx_dsp/vpx_dsp_common.h index 49d36e5458..c8c852374f 100644 --- a/libs/libvpx/vpx_dsp/vpx_dsp_common.h +++ b/libs/libvpx/vpx_dsp/vpx_dsp_common.h @@ -43,6 +43,8 @@ typedef int32_t tran_high_t; typedef int16_t tran_low_t; #endif // CONFIG_VP9_HIGHBITDEPTH +typedef int16_t tran_coef_t; + static INLINE uint8_t clip_pixel(int val) { return (val > 255) ? 255 : (val < 0) ? 0 : val; } @@ -55,7 +57,6 @@ static INLINE double fclamp(double value, double low, double high) { return value < low ? low : (value > high ? high : value); } -#if CONFIG_VP9_HIGHBITDEPTH static INLINE uint16_t clip_pixel_highbd(int val, int bd) { switch (bd) { case 8: @@ -64,7 +65,6 @@ static INLINE uint16_t clip_pixel_highbd(int val, int bd) { case 12: return (uint16_t)clamp(val, 0, 4095); } } -#endif // CONFIG_VP9_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" diff --git a/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl index 9fea2d1cf2..1a743d910e 100644 --- a/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. 
+## + sub vpx_dsp_forward_decls() { print < + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" +#include "vpx_ports/mem.h" + +static void hadamard_col8x2_avx2(__m256i *in, int iter) { + __m256i a0 = in[0]; + __m256i a1 = in[1]; + __m256i a2 = in[2]; + __m256i a3 = in[3]; + __m256i a4 = in[4]; + __m256i a5 = in[5]; + __m256i a6 = in[6]; + __m256i a7 = in[7]; + + __m256i b0 = _mm256_add_epi16(a0, a1); + __m256i b1 = _mm256_sub_epi16(a0, a1); + __m256i b2 = _mm256_add_epi16(a2, a3); + __m256i b3 = _mm256_sub_epi16(a2, a3); + __m256i b4 = _mm256_add_epi16(a4, a5); + __m256i b5 = _mm256_sub_epi16(a4, a5); + __m256i b6 = _mm256_add_epi16(a6, a7); + __m256i b7 = _mm256_sub_epi16(a6, a7); + + a0 = _mm256_add_epi16(b0, b2); + a1 = _mm256_add_epi16(b1, b3); + a2 = _mm256_sub_epi16(b0, b2); + a3 = _mm256_sub_epi16(b1, b3); + a4 = _mm256_add_epi16(b4, b6); + a5 = _mm256_add_epi16(b5, b7); + a6 = _mm256_sub_epi16(b4, b6); + a7 = _mm256_sub_epi16(b5, b7); + + if (iter == 0) { + b0 = _mm256_add_epi16(a0, a4); + b7 = _mm256_add_epi16(a1, a5); + b3 = _mm256_add_epi16(a2, a6); + b4 = _mm256_add_epi16(a3, a7); + b2 = _mm256_sub_epi16(a0, a4); + b6 = _mm256_sub_epi16(a1, a5); + b1 = _mm256_sub_epi16(a2, a6); + b5 = _mm256_sub_epi16(a3, a7); + + a0 = _mm256_unpacklo_epi16(b0, b1); + a1 = _mm256_unpacklo_epi16(b2, b3); + a2 = _mm256_unpackhi_epi16(b0, b1); + a3 = _mm256_unpackhi_epi16(b2, b3); + a4 = _mm256_unpacklo_epi16(b4, b5); + a5 = _mm256_unpacklo_epi16(b6, b7); + a6 = _mm256_unpackhi_epi16(b4, b5); + a7 = _mm256_unpackhi_epi16(b6, b7); + + b0 = _mm256_unpacklo_epi32(a0, a1); + b1 = _mm256_unpacklo_epi32(a4, a5); + b2 = _mm256_unpackhi_epi32(a0, a1); + b3 = _mm256_unpackhi_epi32(a4, a5); + b4 = _mm256_unpacklo_epi32(a2, a3); + b5 = _mm256_unpacklo_epi32(a6, a7); + b6 = _mm256_unpackhi_epi32(a2, a3); + b7 = _mm256_unpackhi_epi32(a6, a7); + + in[0] = _mm256_unpacklo_epi64(b0, b1); + in[1] = _mm256_unpackhi_epi64(b0, b1); + in[2] = _mm256_unpacklo_epi64(b2, b3); + in[3] = _mm256_unpackhi_epi64(b2, b3); + in[4] = _mm256_unpacklo_epi64(b4, b5); + in[5] = _mm256_unpackhi_epi64(b4, b5); + in[6] = _mm256_unpacklo_epi64(b6, b7); + in[7] = _mm256_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm256_add_epi16(a0, a4); + in[7] = _mm256_add_epi16(a1, a5); + in[3] = _mm256_add_epi16(a2, a6); + in[4] = _mm256_add_epi16(a3, a7); + in[2] = _mm256_sub_epi16(a0, a4); + in[6] = _mm256_sub_epi16(a1, a5); + in[1] = _mm256_sub_epi16(a2, a6); + in[5] = _mm256_sub_epi16(a3, a7); + } +} + +static void hadamard_8x8x2_avx2(int16_t const *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + __m256i src[8]; + src[0] = _mm256_loadu_si256((const __m256i *)src_diff); + src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + + hadamard_col8x2_avx2(src, 0); + hadamard_col8x2_avx2(src, 1); + + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x20)); + coeff += 16; + 
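/* The two 128-bit lanes of each src[i] carry the two side-by-side 8x8 blocks,
 * so the stores using selector 0x20 (low lane of both operands) emit the first
 * block's 64 coefficients and the stores using 0x31 (high lanes) emit the
 * second block's. */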
_mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x31)); +} + +void vpx_hadamard_16x16_avx2(int16_t const *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + + for (idx = 0; idx < 2; ++idx) { + int16_t const *src_ptr = src_diff + idx * 8 * src_stride; + hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); + } + + for (idx = 0; idx < 64; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 1); + b1 = _mm256_srai_epi16(b1, 1); + b2 = _mm256_srai_epi16(b2, 1); + b3 = _mm256_srai_epi16(b3, 1); + + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); + + coeff += 16; + t_coeff += 16; + } +} + +int vpx_satd_avx2(const tran_low_t *coeff, int length) { + const __m256i one = _mm256_set1_epi16(1); + __m256i accum = _mm256_setzero_si256(); + int i; + + for (i = 0; i < length; i += 16) { + const __m256i src_line = load_tran_low(coeff); + const __m256i abs = _mm256_abs_epi16(src_line); + const __m256i sum = _mm256_madd_epi16(abs, one); + accum = _mm256_add_epi32(accum, sum); + coeff += 16; + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} diff --git a/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c index b0a104bad0..a235ba41df 100644 --- a/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c @@ -11,6 +11,8 @@ #include #include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_ports/mem.h" void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, @@ -212,8 +214,8 @@ static void hadamard_col8_sse2(__m128i *in, int iter) { } } -void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, - int16_t *coeff) { +void vpx_hadamard_8x8_sse2(int16_t const *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { __m128i src[8]; src[0] = 
_mm_load_si128((const __m128i *)src_diff); src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); @@ -227,25 +229,25 @@ void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, hadamard_col8_sse2(src, 0); hadamard_col8_sse2(src, 1); - _mm_store_si128((__m128i *)coeff, src[0]); + store_tran_low(src[0], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[1]); + store_tran_low(src[1], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[2]); + store_tran_low(src[2], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[3]); + store_tran_low(src[3], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[4]); + store_tran_low(src[4], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[5]); + store_tran_low(src[5], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[6]); + store_tran_low(src[6], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[7]); + store_tran_low(src[7], coeff); } -void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, - int16_t *coeff) { +void vpx_hadamard_16x16_sse2(int16_t const *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { int idx; for (idx = 0; idx < 4; ++idx) { int16_t const *src_ptr = @@ -254,10 +256,10 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, } for (idx = 0; idx < 64; idx += 8) { - __m128i coeff0 = _mm_load_si128((const __m128i *)coeff); - __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64)); - __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128)); - __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192)); + __m128i coeff0 = load_tran_low(coeff); + __m128i coeff1 = load_tran_low(coeff + 64); + __m128i coeff2 = load_tran_low(coeff + 128); + __m128i coeff3 = load_tran_low(coeff + 192); __m128i b0 = _mm_add_epi16(coeff0, coeff1); __m128i b1 = _mm_sub_epi16(coeff0, coeff1); @@ -271,25 +273,25 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, coeff0 = _mm_add_epi16(b0, b2); coeff1 = _mm_add_epi16(b1, b3); - _mm_store_si128((__m128i *)coeff, coeff0); - _mm_store_si128((__m128i *)(coeff + 64), coeff1); + store_tran_low(coeff0, coeff); + store_tran_low(coeff1, coeff + 64); coeff2 = _mm_sub_epi16(b0, b2); coeff3 = _mm_sub_epi16(b1, b3); - _mm_store_si128((__m128i *)(coeff + 128), coeff2); - _mm_store_si128((__m128i *)(coeff + 192), coeff3); + store_tran_low(coeff2, coeff + 128); + store_tran_low(coeff3, coeff + 192); coeff += 8; } } -int vpx_satd_sse2(const int16_t *coeff, int length) { +int vpx_satd_sse2(const tran_low_t *coeff, int length) { int i; const __m128i zero = _mm_setzero_si128(); __m128i accum = zero; for (i = 0; i < length; i += 8) { - const __m128i src_line = _mm_load_si128((const __m128i *)coeff); + const __m128i src_line = load_tran_low(coeff); const __m128i inv = _mm_sub_epi16(zero, src_line); const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line) const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero); diff --git a/libs/libvpx/vpx_dsp/x86/avg_pred_sse2.c b/libs/libvpx/vpx_dsp/x86/avg_pred_sse2.c new file mode 100644 index 0000000000..f83b26490e --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/avg_pred_sse2.c @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + /* comp and pred must be 16 byte aligned. */ + assert(((intptr_t)comp & 0xf) == 0); + assert(((intptr_t)pred & 0xf) == 0); + if (width > 8) { + int x, y; + for (y = 0; y < height; ++y) { + for (x = 0; x < width; x += 16) { + const __m128i p = _mm_load_si128((const __m128i *)(pred + x)); + const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x)); + const __m128i avg = _mm_avg_epu8(p, r); + _mm_store_si128((__m128i *)(comp + x), avg); + } + comp += width; + pred += width; + ref += ref_stride; + } + } else { // width must be 4 or 8. + int i; + // Process 16 elements at a time. comp and pred have width == stride and + // therefore live in contigious memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are all + // divisible by 16 so just ref needs to be massaged when loading. + for (i = 0; i < width * height; i += 16) { + const __m128i p = _mm_load_si128((const __m128i *)pred); + __m128i r; + __m128i avg; + if (width == ref_stride) { + r = _mm_loadu_si128((const __m128i *)ref); + ref += 16; + } else if (width == 4) { + r = _mm_set_epi32(*(const uint32_t *)(ref + 3 * ref_stride), + *(const uint32_t *)(ref + 2 * ref_stride), + *(const uint32_t *)(ref + ref_stride), + *(const uint32_t *)(ref)); + + ref += 4 * ref_stride; + } else { + const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref); + assert(width == 8); + r = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(r_0), + (const __m64 *)(ref + ref_stride))); + + ref += 2 * ref_stride; + } + avg = _mm_avg_epu8(p, r); + _mm_store_si128((__m128i *)comp, avg); + + pred += 16; + comp += 16; + } + } +} diff --git a/libs/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm b/libs/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm index 26412e8e43..22e0a086cc 100644 --- a/libs/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm +++ b/libs/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm @@ -8,42 +8,50 @@ ; be found in the AUTHORS file in the root of the source tree. ; -%define private_prefix vpx - %include "third_party/x86inc/x86inc.asm" - -; This file provides SSSE3 version of the hadamard transformation. Part -; of the macro definitions are originally derived from the ffmpeg project. -; The current version applies to x86 64-bit only. 
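For reference, a scalar sketch (not part of the patch) of what vpx_comp_avg_pred_sse2 above computes; _mm_avg_epu8 rounds up, so each output pixel is (pred + ref + 1) >> 1:

static void comp_avg_pred_sketch(uint8_t *comp, const uint8_t *pred, int width,
                                 int height, const uint8_t *ref,
                                 int ref_stride) {
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) comp[x] = (uint8_t)((pred[x] + ref[x] + 1) >> 1);
    comp += width; /* comp and pred use stride == width */
    pred += width;
    ref += ref_stride;
  }
}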
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" SECTION .text %if ARCH_X86_64 ; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro +%macro TRANSPOSE8X8 10 + ; stage 1 + punpcklwd m%9, m%1, m%2 + punpcklwd m%10, m%3, m%4 + punpckhwd m%1, m%2 + punpckhwd m%3, m%4 -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 + punpcklwd m%2, m%5, m%6 + punpcklwd m%4, m%7, m%8 + punpckhwd m%5, m%6 + punpckhwd m%7, m%8 - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 + ; stage 2 + punpckldq m%6, m%9, m%10 + punpckldq m%8, m%1, m%3 + punpckhdq m%9, m%10 + punpckhdq m%1, m%3 - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 + punpckldq m%10, m%2, m%4 + punpckldq m%3, m%5, m%7 + punpckhdq m%2, m%4 + punpckhdq m%5, m%7 - SWAP %2, %5 - SWAP %4, %7 + ; stage 3 + punpckhqdq m%4, m%9, m%2 ; out3 + punpcklqdq m%9, m%2 ; out2 + punpcklqdq m%7, m%1, m%5 ; out6 + punpckhqdq m%1, m%5 ; out7 + + punpckhqdq m%2, m%6, m%10 ; out1 + punpcklqdq m%6, m%10 ; out0 + punpcklqdq m%5, m%8, m%3 ; out4 + punpckhqdq m%8, m%3 ; out5 + + SWAP %6, %1 + SWAP %3, %9 + SWAP %8, %6 %endmacro %macro HMD8_1D 0 @@ -87,8 +95,9 @@ SECTION .text SWAP 7, 9 %endmacro + INIT_XMM ssse3 -cglobal hadamard_8x8, 3, 5, 10, input, stride, output +cglobal hadamard_8x8, 3, 5, 11, input, stride, output lea r3, [2 * strideq] lea r4, [4 * strideq] @@ -105,17 +114,17 @@ cglobal hadamard_8x8, 3, 5, 10, input, stride, output mova m7, [inputq + r3] HMD8_1D - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10 HMD8_1D - mova [outputq + 0], m0 - mova [outputq + 16], m1 - mova [outputq + 32], m2 - mova [outputq + 48], m3 - mova [outputq + 64], m4 - mova [outputq + 80], m5 - mova [outputq + 96], m6 - mova [outputq + 112], m7 + STORE_TRAN_LOW 0, outputq, 0, 8, 9 + STORE_TRAN_LOW 1, outputq, 8, 8, 9 + STORE_TRAN_LOW 2, outputq, 16, 8, 9 + STORE_TRAN_LOW 3, outputq, 24, 8, 9 + STORE_TRAN_LOW 4, outputq, 32, 8, 9 + STORE_TRAN_LOW 5, outputq, 40, 8, 9 + STORE_TRAN_LOW 6, outputq, 48, 8, 9 + STORE_TRAN_LOW 7, outputq, 56, 8, 9 RET %endif diff --git a/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h new file mode 100644 index 0000000000..3552c07cd3 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ +#define VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Load 16 16 bit values. If the source is 32 bits then pack down with +// saturation. 
+static INLINE __m256i load_tran_low(const tran_low_t *a) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i a_low = _mm256_loadu_si256((const __m256i *)a); + const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8)); + return _mm256_packs_epi32(a_low, a_high); +#else + return _mm256_loadu_si256((const __m256i *)a); +#endif +} + +static INLINE void store_tran_low(__m256i a, tran_low_t *b) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_hi = _mm256_mulhi_epi16(a, one); + const __m256i a_lo = _mm256_mullo_epi16(a, one); + const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi); + const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi); + _mm256_storeu_si256((__m256i *)b, a_1); + _mm256_storeu_si256((__m256i *)(b + 8), a_2); +#else + _mm256_storeu_si256((__m256i *)b, a); +#endif +} +#endif // VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm new file mode 100644 index 0000000000..aacf71f7ac --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm @@ -0,0 +1,90 @@ +; +; Copyright (c) 2017 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +; TODO(johannkoenig): Add the necessary include guards to vpx_config.asm. +; vpx_config.asm is not guarded so can not be included twice. Because this will +; be used in conjunction with x86_abi_support.asm or x86inc.asm, it must be +; included after those files. + +; Increment register by sizeof() tran_low_t * 8. +%macro INCREMENT_TRAN_LOW 1 +%if CONFIG_VP9_HIGHBITDEPTH + add %1, 32 +%else + add %1, 16 +%endif +%endmacro + +; Increment %1 by sizeof() tran_low_t * %2. +%macro INCREMENT_ELEMENTS_TRAN_LOW 2 +%if CONFIG_VP9_HIGHBITDEPTH + lea %1, [%1 + %2 * 4] +%else + lea %1, [%1 + %2 * 2] +%endif +%endmacro + +; Load %2 + %3 into m%1. +; %3 is the offset in elements, not bytes. +; If tran_low_t is 16 bits (low bit depth configuration) then load the value +; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack +; the values down to 16 bits. +%macro LOAD_TRAN_LOW 3 +%if CONFIG_VP9_HIGHBITDEPTH + mova m%1, [%2 + (%3) * 4] + packssdw m%1, [%2 + (%3) * 4 + 16] +%else + mova m%1, [%2 + (%3) * 2] +%endif +%endmacro + +; Store m%1 to %2 + %3. +; %3 is the offset in elements, not bytes. +; If 5 arguments are provided then m%1 is corrupted. +; If 6 arguments are provided then m%1 is preserved. +; If tran_low_t is 16 bits (low bit depth configuration) then store the value +; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign +; extend the values first. +; Uses m%4-m%6 as scratch registers for high bit depth. +%macro STORE_TRAN_LOW 5-6 +%if CONFIG_VP9_HIGHBITDEPTH + pxor m%4, m%4 + mova m%5, m%1 + %if %0 == 6 + mova m%6, m%1 + %endif + pcmpgtw m%4, m%1 + punpcklwd m%5, m%4 + %if %0 == 5 + punpckhwd m%1, m%4 + %else + punpckhwd m%6, m%4 + %endif + mova [%2 + (%3) * 4 + 0], m%5 + %if %0 == 5 + mova [%2 + (%3) * 4 + 16], m%1 + %else + mova [%2 + (%3) * 4 + 16], m%6 + %endif +%else + mova [%2 + (%3) * 2], m%1 +%endif +%endmacro + +; Store zeros (in m%1) to %2 + %3. +; %3 is the offset in elements, not bytes. 
+%macro STORE_ZERO_TRAN_LOW 3 +%if CONFIG_VP9_HIGHBITDEPTH + mova [%2 + (%3) * 4 + 0], m%1 + mova [%2 + (%3) * 4 + 16], m%1 +%else + mova [%2 + (%3) * 2], m%1 +%endif +%endmacro diff --git a/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h new file mode 100644 index 0000000000..5d1d779572 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ +#define VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Load 8 16 bit values. If the source is 32 bits then pack down with +// saturation. +static INLINE __m128i load_tran_low(const tran_low_t *a) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i a_low = _mm_load_si128((const __m128i *)a); + return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); +#else + return _mm_load_si128((const __m128i *)a); +#endif +} + +// Store 8 16 bit values. If the destination is 32 bits then sign extend the +// values by multiplying by 1. +static INLINE void store_tran_low(__m128i a, tran_low_t *b) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i one = _mm_set1_epi16(1); + const __m128i a_hi = _mm_mulhi_epi16(a, one); + const __m128i a_lo = _mm_mullo_epi16(a, one); + const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi); + const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi); + _mm_store_si128((__m128i *)(b), a_1); + _mm_store_si128((__m128i *)(b + 4), a_2); +#else + _mm_store_si128((__m128i *)(b), a); +#endif +} + +// Zero fill 8 positions in the output buffer. 
+static INLINE void store_zero_tran_low(tran_low_t *a) { + const __m128i zero = _mm_setzero_si128(); +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(a), zero); + _mm_store_si128((__m128i *)(a + 4), zero); +#else + _mm_store_si128((__m128i *)(a), zero); +#endif +} +#endif // VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/convolve.h b/libs/libvpx/vpx_dsp/x86/convolve.h index 2a0516cdc1..68d7589d45 100644 --- a/libs/libvpx/vpx_dsp/x86/convolve.h +++ b/libs/libvpx/vpx_dsp/x86/convolve.h @@ -20,11 +20,16 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter); -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ +#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \ void vpx_convolve8_##name##_##opt( \ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h) { \ + ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + const int16_t *filter = filter_kernel[offset]; \ + (void)x0_q4; \ + (void)x_step_q4; \ + (void)y0_q4; \ + (void)y_step_q4; \ assert(filter[3] != 128); \ assert(step_q4 == 16); \ if (filter[0] | filter[1] | filter[2]) { \ @@ -60,32 +65,36 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, } \ } -#define FUN_CONV_2D(avg, opt) \ - void vpx_convolve8_##avg##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h) { \ - assert(filter_x[3] != 128); \ - assert(filter_y[3] != 128); \ - assert(w <= 64); \ - assert(h <= 64); \ - assert(x_step_q4 == 16); \ - assert(y_step_q4 == 16); \ - if (filter_x[0] | filter_x[1] | filter_x[2]) { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ - vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, \ - h + 7); \ - vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } else { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ - vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h + 1); \ - vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h); \ - } \ +#define FUN_CONV_2D(avg, opt) \ + void vpx_convolve8_##avg##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + const int16_t *filter_x = filter[x0_q4]; \ + const int16_t *filter_y = filter[y0_q4]; \ + (void)filter_y; \ + assert(filter_x[3] != 128); \ + assert(filter_y[3] != 128); \ + assert(w <= 64); \ + assert(h <= 64); \ + assert(x_step_q4 == 16); \ + assert(y_step_q4 == 16); \ + if (filter_x[0] | filter_x[1] | filter_x[2]) { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ + vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ + h + 7); \ + vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ + filter, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h); \ + } else { \ + 
DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ + vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4, \ + x_step_q4, y0_q4, y_step_q4, w, h + 1); \ + vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ + h); \ + } \ } #if CONFIG_VP9_HIGHBITDEPTH @@ -97,97 +106,97 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, unsigned int output_height, const int16_t *filter, int bd); -#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vpx_highbd_convolve8_##name##_##opt( \ - const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ - if (step_q4 == 16 && filter[3] != 128) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - if (filter[0] | filter[1] | filter[2]) { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h, bd); \ - } \ +#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \ + void vpx_highbd_convolve8_##name##_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ + const int16_t *filter = filter_kernel[offset]; \ + if (step_q4 == 16 && filter[3] != 128) { \ + if (filter[0] | filter[1] | filter[2]) { \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 
8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ + filter_kernel, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h, bd); \ + } \ } -#define HIGH_FUN_CONV_2D(avg, opt) \ - void vpx_highbd_convolve8_##avg##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ - assert(w <= 64); \ - assert(h <= 64); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ - vpx_highbd_convolve8_horiz_##opt( \ - src - 3 * src_stride, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, h + 7, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt( \ - CONVERT_TO_BYTEPTR(fdata2) + 192, 64, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h, bd); \ - } else { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ - vpx_highbd_convolve8_horiz_##opt( \ - src, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h + 1, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt( \ - CONVERT_TO_BYTEPTR(fdata2), 64, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h, bd); \ - } \ - } else { \ - vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h, bd); \ - } \ +#define HIGH_FUN_CONV_2D(avg, opt) \ + void vpx_highbd_convolve8_##avg##opt( \ + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ + const int16_t *filter_x = filter[x0_q4]; \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ + vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + fdata2, 64, filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h + 7, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt( \ + fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h, bd); \ + } else { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ + vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, \ + w, h + 1, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h, bd); \ + } \ + } else { \ + vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, \ + bd); \ + } \ } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libs/libvpx/vpx_dsp/x86/convolve_avx2.h b/libs/libvpx/vpx_dsp/x86/convolve_avx2.h new file mode 100644 index 0000000000..bc96b738f4 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/convolve_avx2.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_X86_CONVOLVE_AVX2_H_ +#define VPX_DSP_X86_CONVOLVE_AVX2_H_ + +#include // AVX2 + +#include "./vpx_config.h" + +#if defined(__clang__) +#if (__clang_major__ > 0 && __clang_major__ < 3) || \ + (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (defined(__APPLE__) && defined(__apple_build_version__) && \ + ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ + (__clang_major__ == 5 && __clang_minor__ == 0))) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#else // clang > 3.3, and not 5.0 on macosx. +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // clang <= 3.3 +#elif defined(__GNUC__) +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +#else // gcc > 4.7 +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // gcc <= 4.6 +#else // !(gcc || clang) +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // __clang__ + +static INLINE void shuffle_filter_avx2(const int16_t *const filter, + __m256i *const f) { + const __m256i f_values = + MM256_BROADCASTSI128_SI256(_mm_load_si128((const __m128i *)filter)); + // pack and duplicate the filter values + f[0] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0200u)); + f[1] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0604u)); + f[2] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0a08u)); + f[3] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0e0cu)); +} + +static INLINE __m256i convolve8_16_avx2(const __m256i *const s, + const __m256i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m256i k_64 = _mm256_set1_epi16(1 << 6); + const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]); + const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]); + const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]); + const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]); + __m256i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm256_add_epi16(x0, x2); + sum2 = _mm256_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm256_add_epi16(sum1, k_64); + sum1 = _mm256_adds_epi16(sum1, sum2); + // round and shift by 7 bit each 16 bit + sum1 = _mm256_srai_epi16(sum1, 7); + return sum1; +} + +static INLINE __m128i convolve8_8_avx2(const __m256i *const s, + const __m256i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[0]), + _mm256_castsi256_si128(f[0])); + const __m128i x1 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[1]), + _mm256_castsi256_si128(f[1])); + const __m128i x2 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[2]), + _mm256_castsi256_si128(f[2])); + const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]), + _mm256_castsi256_si128(f[3])); + __m128i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 
with x3 is the only order that prevents
+  // outranges for all filters
+  sum1 = _mm_add_epi16(x0, x2);
+  sum2 = _mm_add_epi16(x1, x3);
+  // add the rounding offset early to avoid another saturated add
+  sum1 = _mm_add_epi16(sum1, k_64);
+  sum1 = _mm_adds_epi16(sum1, sum2);
+  // shift by 7 bit each 16 bit
+  sum1 = _mm_srai_epi16(sum1, 7);
+  return sum1;
+}
+
+#undef MM256_BROADCASTSI128_SI256
+
+#endif // VPX_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/libs/libvpx/vpx_dsp/x86/convolve_ssse3.h b/libs/libvpx/vpx_dsp/x86/convolve_ssse3.h
new file mode 100644
index 0000000000..e5d452f99e
--- /dev/null
+++ b/libs/libvpx/vpx_dsp/x86/convolve_ssse3.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_CONVOLVE_SSSE3_H_
+#define VPX_DSP_X86_CONVOLVE_SSSE3_H_
+
+#include <assert.h>
+#include <tmmintrin.h>  // SSSE3
+
+#include "./vpx_config.h"
+
+static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
+                                        __m128i *const f) {
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+}
+
+static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
+                                            __m128i *const f) {
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  // It utilizes the fact that the high byte of filter[3] is always 0 to clean
+  // half of f[0] and f[4].
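+  // The masks below pair the taps as (0, zero), (1, 2), (3, 4), (5, 6) and
+  // (7, zero); the zero byte comes from the high byte of filter[3], which the
+  // assert below guarantees is 0.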
+ assert(filter[3] >= 0 && filter[3] < 256); + f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u)); + f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u)); + f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u)); + f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au)); + f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu)); +} + +static INLINE __m128i convolve8_8_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + __m128i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm_add_epi16(x0, x2); + sum2 = _mm_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm_add_epi16(sum1, k_64); + sum1 = _mm_adds_epi16(sum1, sum2); + // shift by 7 bit each 16 bit + sum1 = _mm_srai_epi16(sum1, 7); + return sum1; +} + +static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + // compensate the subtracted 64 in f[1]. x4 is always non negative. + const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64)); + // add and saturate the results together + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, x1); + temp = _mm_adds_epi16(temp, x2); + temp = _mm_adds_epi16(temp, x4); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + +static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]); + // compensate the subtracted 64 in f[2]. x5 is always non negative. 
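+  // x5 = 64 * (p0 + p1) for each adjacent pixel pair in s[2]; this is at most
+  // 64 * 255 * 2 = 32640, so it stays within the int16 range.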
+ const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64)); + __m128i temp; + + // add and saturate the results together + temp = _mm_adds_epi16(x0, x1); + temp = _mm_adds_epi16(temp, x2); + temp = _mm_adds_epi16(temp, x3); + temp = _mm_adds_epi16(temp, x4); + temp = _mm_adds_epi16(temp, x5); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + +#endif // VPX_DSP_X86_CONVOLVE_SSSE3_H_ diff --git a/libs/libvpx/vpx_dsp/x86/deblock_sse2.asm b/libs/libvpx/vpx_dsp/x86/deblock_sse2.asm index 6df360df44..97cb43b671 100644 --- a/libs/libvpx/vpx_dsp/x86/deblock_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/deblock_sse2.asm @@ -78,11 +78,13 @@ %endmacro %macro UPDATE_FLIMIT 0 - movdqa xmm2, XMMWORD PTR [rbx] - movdqa [rsp], xmm2 + movdqu xmm2, XMMWORD PTR [rbx] + movdqu [rsp], xmm2 add rbx, 16 %endmacro +SECTION .text + ;void vpx_post_proc_down_and_across_mb_row_sse2 ;( ; unsigned char *src_ptr, @@ -230,11 +232,11 @@ sym(vpx_post_proc_down_and_across_mb_row_sse2): ret %undef flimit -;void vpx_mbpost_proc_down_xmm(unsigned char *dst, -; int pitch, int rows, int cols,int flimit) +;void vpx_mbpost_proc_down_sse2(unsigned char *dst, +; int pitch, int rows, int cols,int flimit) extern sym(vpx_rv) -global sym(vpx_mbpost_proc_down_xmm) PRIVATE -sym(vpx_mbpost_proc_down_xmm): +global sym(vpx_mbpost_proc_down_sse2) PRIVATE +sym(vpx_mbpost_proc_down_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 @@ -462,10 +464,10 @@ sym(vpx_mbpost_proc_down_xmm): %undef flimit4 -;void vpx_mbpost_proc_across_ip_xmm(unsigned char *src, -; int pitch, int rows, int cols,int flimit) -global sym(vpx_mbpost_proc_across_ip_xmm) PRIVATE -sym(vpx_mbpost_proc_across_ip_xmm): +;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, +; int pitch, int rows, int cols,int flimit) +global sym(vpx_mbpost_proc_across_ip_sse2) PRIVATE +sym(vpx_mbpost_proc_across_ip_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 diff --git a/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h index 39d3a3f59c..132e065239 100644 --- a/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h +++ b/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h @@ -51,7 +51,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) { // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. - const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); + const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64); const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64); const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); diff --git a/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h index 3744333909..32b9bd2813 100644 --- a/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h @@ -63,7 +63,7 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) { // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. 
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); diff --git a/libs/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h b/libs/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h index 743e55e635..f9abaecf28 100644 --- a/libs/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h @@ -261,7 +261,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); @@ -582,7 +582,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); diff --git a/libs/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/libs/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm index 78a1dbb24f..32824a03a3 100644 --- a/libs/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm +++ b/libs/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm @@ -10,10 +10,6 @@ %include "third_party/x86inc/x86inc.asm" -; This file provides SSSE3 version of the forward transformation. Part -; of the macro definitions are originally derived from the ffmpeg project. -; The current version applies to x86 64-bit only. 
- SECTION_RODATA pw_11585x2: times 8 dw 23170 @@ -32,107 +28,11 @@ TRANSFORM_COEFFS 9102, 13623 SECTION .text %if ARCH_X86_64 -%macro SUM_SUB 3 - psubw m%3, m%1, m%2 - paddw m%1, m%2 - SWAP %2, %3 -%endmacro - -; butterfly operation -%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 - pmaddwd m%1, m%3, %5 - pmaddwd m%2, m%3, %6 - paddd m%1, %4 - paddd m%2, %4 - psrad m%1, 14 - psrad m%2, 14 -%endmacro - -%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_%4_%3], [pw_%3_m%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_%4_%3], [pw_%3_m%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro - -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 - - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 - - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 - - SWAP %2, %5 - SWAP %4, %7 -%endmacro - -; 1D forward 8x8 DCT transform -%macro FDCT8_1D 1 - SUM_SUB 0, 7, 9 - SUM_SUB 1, 6, 9 - SUM_SUB 2, 5, 9 - SUM_SUB 3, 4, 9 - - SUM_SUB 0, 3, 9 - SUM_SUB 1, 2, 9 - SUM_SUB 6, 5, 9 -%if %1 == 0 - SUM_SUB 0, 1, 9 -%endif - - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 - - pmulhrsw m6, m12 - pmulhrsw m5, m12 -%if %1 == 0 - pmulhrsw m0, m12 - pmulhrsw m1, m12 -%else - BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 - SWAP 0, 1 -%endif - - SUM_SUB 4, 5, 9 - SUM_SUB 7, 6, 9 - BUTTERFLY_4X 4, 7, 3196, 16069, m8, 9, 10 - BUTTERFLY_4X 5, 6, 13623, 9102, m8, 9, 10 - SWAP 1, 4 - SWAP 3, 6 -%endmacro - -%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2 - psraw m%3, m%1, 15 - psraw m%4, m%2, 15 - psubw m%1, m%3 - psubw m%2, m%4 - psraw m%1, 1 - psraw m%2, 1 -%endmacro - INIT_XMM ssse3 cglobal fdct8x8, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m12, [pw_11585x2] - pxor m11, m11 + mova m8, [GLOBAL(pd_8192)] + mova m12, [GLOBAL(pw_11585x2)] lea r3, [2 * strideq] lea r4, [4 * strideq] @@ -159,25 +59,303 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride psllw m7, 2 ; column transform - FDCT8_1D 0 - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + ; stage 1 + paddw m10, m0, m7 + psubw m0, m7 - FDCT8_1D 1 - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + paddw m9, m1, m6 + psubw m1, m6 - DIVIDE_ROUND_2X 0, 1, 9, 10 - DIVIDE_ROUND_2X 2, 3, 9, 10 - DIVIDE_ROUND_2X 4, 5, 9, 10 - DIVIDE_ROUND_2X 6, 7, 9, 10 + paddw m7, m2, m5 + psubw m2, m5 - mova [outputq + 0], m0 - mova [outputq + 16], m1 - mova [outputq + 32], m2 - mova [outputq + 48], m3 - mova [outputq + 64], m4 - mova [outputq + 80], m5 - mova [outputq + 96], m6 - mova [outputq + 112], m7 + paddw m6, m3, m4 + psubw m3, m4 + + ; stage 2 + paddw m5, m9, m7 + psubw m9, m7 + + paddw m4, m10, m6 + psubw m10, m6 + + paddw m7, m1, m2 + psubw m1, m2 + + ; stage 3 + paddw m6, m4, m5 + psubw m4, m5 + + pmulhrsw m1, m12 + pmulhrsw m7, m12 + + ; sin(pi / 8), cos(pi / 8) + punpcklwd m2, m10, m9 + punpckhwd m10, m9 + pmaddwd m5, m2, [GLOBAL(pw_15137_6270)] + pmaddwd m2, [GLOBAL(pw_6270_m15137)] + pmaddwd m9, m10, [GLOBAL(pw_15137_6270)] + pmaddwd m10, [GLOBAL(pw_6270_m15137)] + paddd m5, m8 + paddd m2, m8 + paddd m9, m8 + paddd m10, m8 + psrad m5, 14 + psrad m2, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m5, m9 + packssdw m2, m10 + + 
pmulhrsw m6, m12 + pmulhrsw m4, m12 + + paddw m9, m3, m1 + psubw m3, m1 + + paddw m10, m0, m7 + psubw m0, m7 + + ; stage 4 + ; sin(pi / 16), cos(pi / 16) + punpcklwd m1, m10, m9 + punpckhwd m10, m9 + pmaddwd m7, m1, [GLOBAL(pw_16069_3196)] + pmaddwd m1, [GLOBAL(pw_3196_m16069)] + pmaddwd m9, m10, [GLOBAL(pw_16069_3196)] + pmaddwd m10, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m1, m8 + paddd m9, m8 + paddd m10, m8 + psrad m7, 14 + psrad m1, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m7, m9 + packssdw m1, m10 + + ; sin(3 * pi / 16), cos(3 * pi / 16) + punpcklwd m11, m0, m3 + punpckhwd m0, m3 + pmaddwd m9, m11, [GLOBAL(pw_9102_13623)] + pmaddwd m11, [GLOBAL(pw_13623_m9102)] + pmaddwd m3, m0, [GLOBAL(pw_9102_13623)] + pmaddwd m0, [GLOBAL(pw_13623_m9102)] + paddd m9, m8 + paddd m11, m8 + paddd m3, m8 + paddd m0, m8 + psrad m9, 14 + psrad m11, 14 + psrad m3, 14 + psrad m0, 14 + packssdw m9, m3 + packssdw m11, m0 + + ; transpose + ; stage 1 + punpcklwd m0, m6, m7 + punpcklwd m3, m5, m11 + punpckhwd m6, m7 + punpckhwd m5, m11 + punpcklwd m7, m4, m9 + punpcklwd m10, m2, m1 + punpckhwd m4, m9 + punpckhwd m2, m1 + + ; stage 2 + punpckldq m9, m0, m3 + punpckldq m1, m6, m5 + punpckhdq m0, m3 + punpckhdq m6, m5 + punpckldq m3, m7, m10 + punpckldq m5, m4, m2 + punpckhdq m7, m10 + punpckhdq m4, m2 + + ; stage 3 + punpcklqdq m10, m9, m3 + punpckhqdq m9, m3 + punpcklqdq m2, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m3, m1, m5 + punpckhqdq m1, m5 + punpcklqdq m7, m6, m4 + punpckhqdq m6, m4 + + ; row transform + ; stage 1 + paddw m5, m10, m6 + psubw m10, m6 + + paddw m4, m9, m7 + psubw m9, m7 + + paddw m6, m2, m1 + psubw m2, m1 + + paddw m7, m0, m3 + psubw m0, m3 + + ;stage 2 + paddw m1, m5, m7 + psubw m5, m7 + + paddw m3, m4, m6 + psubw m4, m6 + + paddw m7, m9, m2 + psubw m9, m2 + + ; stage 3 + punpcklwd m6, m1, m3 + punpckhwd m1, m3 + pmaddwd m2, m6, [GLOBAL(pw_11585_11585)] + pmaddwd m6, [GLOBAL(pw_11585_m11585)] + pmaddwd m3, m1, [GLOBAL(pw_11585_11585)] + pmaddwd m1, [GLOBAL(pw_11585_m11585)] + paddd m2, m8 + paddd m6, m8 + paddd m3, m8 + paddd m1, m8 + psrad m2, 14 + psrad m6, 14 + psrad m3, 14 + psrad m1, 14 + packssdw m2, m3 + packssdw m6, m1 + + pmulhrsw m7, m12 + pmulhrsw m9, m12 + + punpcklwd m3, m5, m4 + punpckhwd m5, m4 + pmaddwd m1, m3, [GLOBAL(pw_15137_6270)] + pmaddwd m3, [GLOBAL(pw_6270_m15137)] + pmaddwd m4, m5, [GLOBAL(pw_15137_6270)] + pmaddwd m5, [GLOBAL(pw_6270_m15137)] + paddd m1, m8 + paddd m3, m8 + paddd m4, m8 + paddd m5, m8 + psrad m1, 14 + psrad m3, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m1, m4 + packssdw m3, m5 + + paddw m4, m0, m9 + psubw m0, m9 + + paddw m5, m10, m7 + psubw m10, m7 + + ; stage 4 + punpcklwd m9, m5, m4 + punpckhwd m5, m4 + pmaddwd m7, m9, [GLOBAL(pw_16069_3196)] + pmaddwd m9, [GLOBAL(pw_3196_m16069)] + pmaddwd m4, m5, [GLOBAL(pw_16069_3196)] + pmaddwd m5, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m9, m8 + paddd m4, m8 + paddd m5, m8 + psrad m7, 14 + psrad m9, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m7, m4 + packssdw m9, m5 + + punpcklwd m4, m10, m0 + punpckhwd m10, m0 + pmaddwd m5, m4, [GLOBAL(pw_9102_13623)] + pmaddwd m4, [GLOBAL(pw_13623_m9102)] + pmaddwd m0, m10, [GLOBAL(pw_9102_13623)] + pmaddwd m10, [GLOBAL(pw_13623_m9102)] + paddd m5, m8 + paddd m4, m8 + paddd m0, m8 + paddd m10, m8 + psrad m5, 14 + psrad m4, 14 + psrad m0, 14 + psrad m10, 14 + packssdw m5, m0 + packssdw m4, m10 + + ; transpose + ; stage 1 + punpcklwd m0, m2, m7 + punpcklwd m10, m1, m4 + punpckhwd m2, m7 + punpckhwd m1, m4 + punpcklwd m7, m6, m5 + punpcklwd m4, m3, m9 + 
punpckhwd m6, m5 + punpckhwd m3, m9 + + ; stage 2 + punpckldq m5, m0, m10 + punpckldq m9, m2, m1 + punpckhdq m0, m10 + punpckhdq m2, m1 + punpckldq m10, m7, m4 + punpckldq m1, m6, m3 + punpckhdq m7, m4 + punpckhdq m6, m3 + + ; stage 3 + punpcklqdq m4, m5, m10 + punpckhqdq m5, m10 + punpcklqdq m3, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m10, m9, m1 + punpckhqdq m9, m1 + punpcklqdq m7, m2, m6 + punpckhqdq m2, m6 + + psraw m1, m4, 15 + psraw m6, m5, 15 + psraw m8, m3, 15 + psraw m11, m0, 15 + + psubw m4, m1 + psubw m5, m6 + psubw m3, m8 + psubw m0, m11 + + psraw m4, 1 + psraw m5, 1 + psraw m3, 1 + psraw m0, 1 + + psraw m1, m10, 15 + psraw m6, m9, 15 + psraw m8, m7, 15 + psraw m11, m2, 15 + + psubw m10, m1 + psubw m9, m6 + psubw m7, m8 + psubw m2, m11 + + psraw m10, 1 + psraw m9, 1 + psraw m7, 1 + psraw m2, 1 + + mova [outputq + 0], m4 + mova [outputq + 16], m5 + mova [outputq + 32], m3 + mova [outputq + 48], m0 + mova [outputq + 64], m10 + mova [outputq + 80], m9 + mova [outputq + 96], m7 + mova [outputq + 112], m2 RET %endif diff --git a/libs/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c b/libs/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c new file mode 100644 index 0000000000..7e75d5d10c --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c @@ -0,0 +1,1108 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/convolve.h" + +// ----------------------------------------------------------------------------- +// Copy and average + +void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int width, int h, int bd) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + (void)bd; + + assert(width % 4 == 0); + if (width > 32) { // width = 64 + do { + const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); + const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, p0); + _mm256_storeu_si256((__m256i *)(dst + 16), p1); + _mm256_storeu_si256((__m256i *)(dst + 32), p2); + _mm256_storeu_si256((__m256i *)(dst + 48), p3); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 16) { // width = 32 + do { + const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, p0); + _mm256_storeu_si256((__m256i *)(dst + 16), p1); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 8) { // width = 16 + __m256i p0, p1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + src += src_stride; + p1 = _mm256_loadu_si256((const __m256i *)src); + src += src_stride; + + _mm256_storeu_si256((__m256i *)dst, p0); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (width > 4) { // width 
= 8 + __m128i p0, p1; + do { + p0 = _mm_loadu_si128((const __m128i *)src); + src += src_stride; + p1 = _mm_loadu_si128((const __m128i *)src); + src += src_stride; + + _mm_storeu_si128((__m128i *)dst, p0); + dst += dst_stride; + _mm_storeu_si128((__m128i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // width = 4 + __m128i p0, p1; + do { + p0 = _mm_loadl_epi64((const __m128i *)src); + src += src_stride; + p1 = _mm_loadl_epi64((const __m128i *)src); + src += src_stride; + + _mm_storel_epi64((__m128i *)dst, p0); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } +} + +void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int width, int h, int bd) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + (void)bd; + + assert(width % 4 == 0); + if (width > 32) { // width = 64 + __m256i p0, p1, p2, p3, u0, u1, u2, u3; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); + p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); + src += src_stride; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); + u2 = _mm256_loadu_si256((const __m256i *)(dst + 32)); + u3 = _mm256_loadu_si256((const __m256i *)(dst + 48)); + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); + _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2)); + _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3)); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 16) { // width = 32 + __m256i p0, p1, u0, u1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + src += src_stride; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 8) { // width = 16 + __m256i p0, p1, u0, u1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride)); + src += src_stride << 1; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride)); + + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + dst_stride), + _mm256_avg_epu16(p1, u1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } else if (width > 4) { // width = 8 + __m128i p0, p1, u0, u1; + do { + p0 = _mm_loadu_si128((const __m128i *)src); + p1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); + src += src_stride << 1; + u0 = _mm_loadu_si128((const __m128i *)dst); + u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride)); + + _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0)); + _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } else { // width = 4 + __m128i p0, p1, u0, u1; + do { + p0 = _mm_loadl_epi64((const __m128i *)src); + p1 = _mm_loadl_epi64((const __m128i *)(src + 
src_stride)); + src += src_stride << 1; + u0 = _mm_loadl_epi64((const __m128i *)dst); + u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride)); + + _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0)); + _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } +} + +// ----------------------------------------------------------------------------- +// Horizontal and vertical filtering + +#define CONV8_ROUNDING_BITS (7) + +static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; + +static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13, + 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13 }; + +static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15, + 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15 }; + +static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; + +// ----------------------------------------------------------------------------- +// Horizontal Filtering + +static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0); + const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1); + const __m256i c = _mm256_permutevar8x32_epi32(*s, idx); + + p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6 + p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7 + p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4 + p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5 +} + +// Note: +// Shared by 8x2 and 16x1 block +static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1, + __m256i *x /*x[8]*/) { + __m256i pp[8]; + pack_pixels(s0, pp); + pack_pixels(s1, &pp[4]); + x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20); + x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20); + x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20); + x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20); + x[4] = x[2]; + x[5] = x[3]; + x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31); + x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31); +} + +static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) { + __m256i pp[8]; + __m256i s0; + s0 = _mm256_loadu_si256((const __m256i *)src); + pack_pixels(&s0, pp); + x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30); + x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30); + x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30); + x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); +} + +static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride, + __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + stride)); + pack_16_pixels(&s0, &s1, x); +} + +static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_pixels(&s0, &s1, x); +} + +// Note: +// Shared by horizontal and vertical filtering +static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p0 = _mm256_set1_epi32(0x03020100); + const __m256i p1 = 
_mm256_set1_epi32(0x07060504); + const __m256i p2 = _mm256_set1_epi32(0x0b0a0908); + const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c); + f[0] = _mm256_shuffle_epi8(hh, p0); + f[1] = _mm256_shuffle_epi8(hh, p1); + f[2] = _mm256_shuffle_epi8(hh, p2); + f[3] = _mm256_shuffle_epi8(hh, p3); +} + +static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, + const __m256i *fil /*fil[4]*/, + __m256i *y) { + __m256i a, a0, a1; + + a0 = _mm256_madd_epi16(fil[0], sig[0]); + a1 = _mm256_madd_epi16(fil[3], sig[3]); + a = _mm256_add_epi32(a0, a1); + + a0 = _mm256_madd_epi16(fil[1], sig[1]); + a1 = _mm256_madd_epi16(fil[2], sig[2]); + + { + const __m256i min = _mm256_min_epi32(a0, a1); + a = _mm256_add_epi32(a, min); + } + { + const __m256i max = _mm256_max_epi32(a0, a1); + a = _mm256_add_epi32(a, max); + } + { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + a = _mm256_add_epi32(a, rounding); + *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS); + } +} + +static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask, + uint16_t *dst) { + const __m128i a0 = _mm256_castsi256_si128(*y); + const __m128i a1 = _mm256_extractf128_si256(*y, 1); + __m128i res = _mm_packus_epi32(a0, a1); + res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); + _mm_storeu_si128((__m128i *)dst, res); +} + +static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); +} + +static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm256_storeu_si256((__m256i *)dst, a); +} + +static void vpx_highbd_filter_block1d8_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_8x2_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + store_8x1_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_16x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// 2-tap horizontal filtering + +static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) { + const __m128i h = 
_mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p = _mm256_set1_epi32(0x09080706); + f[0] = _mm256_shuffle_epi8(hh, p); +} + +// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels() +// the difference is s0/s1 specifies first and second rows or, +// first 16 samples and 8-sample shifted 16 samples +static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, + __m256i *sig) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i x0 = _mm256_shuffle_epi8(*s0, sf2); + __m256i x1 = _mm256_shuffle_epi8(*s1, sf2); + __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx); + __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + r1 = _mm256_shuffle_epi8(r1, sf2); + sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20); + sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20); +} + +static INLINE void pack_8x2_2t_pixels(const uint16_t *src, + const ptrdiff_t pitch, __m256i *sig) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_16x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_8x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x0 = _mm256_shuffle_epi8(r0, sf2); + r0 = _mm256_permutevar8x32_epi32(r0, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20); +} + +// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels() +static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + __m256i x1 = _mm256_madd_epi16(sig[1], *f); + x0 = _mm256_add_epi32(x0, rounding); + x1 = _mm256_add_epi32(x1, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + x0 = _mm256_add_epi32(x0, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); +} + +static void vpx_highbd_filter_block1d8_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_8x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_2t_pixels(src_ptr, 
signal); + filter_8x1_2t_pixels(signal, &ff, &res0); + store_8x1_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_16x1_2t_pixels(src_ptr, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// Vertical Filtering + +static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src)); + __m256i s1 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch))); + __m256i s2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 2 * pitch))); + __m256i s3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 3 * pitch))); + __m256i s4 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 4 * pitch))); + __m256i s5 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 5 * pitch))); + __m256i s6 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 6 * pitch))); + + s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1); + s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1); + s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1); + s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1); + s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1); + + sig[0] = _mm256_unpacklo_epi16(s0, s1); + sig[4] = _mm256_unpackhi_epi16(s0, s1); + sig[1] = _mm256_unpacklo_epi16(s2, s3); + sig[5] = _mm256_unpackhi_epi16(s2, s3); + sig[2] = _mm256_unpacklo_epi16(s4, s5); + sig[6] = _mm256_unpackhi_epi16(s4, s5); + sig[8] = s6; +} + +static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + __m256i s0 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 7 * pitch))); + // base + 8th row + __m256i s1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 8 * pitch))); + __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1); + __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + sig[3] = _mm256_unpacklo_epi16(s2, s3); + sig[7] = _mm256_unpackhi_epi16(s2, s3); + sig[8] = s1; +} + +static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_8x1_pixels(sig, f, y0); + filter_8x1_pixels(&sig[4], f, y1); +} + +static INLINE void update_pixels(__m256i *sig) { + int i; + for (i = 0; i < 3; ++i) { + sig[i] = sig[i + 1]; + sig[i + 4] = sig[i + 5]; + } +} + +static void vpx_highbd_filter_block1d8_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[9], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_8x9_init(src_ptr, src_pitch, signal); + + do { + pack_8x9_pixels(src_ptr, src_pitch, signal); + + 
filter_8x9_pixels(signal, ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i u0, u1, u2, u3; + // load 0-6 rows + const __m256i s0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch)); + const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch)); + const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch)); + const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch)); + const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch)); + + u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low + u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high + + u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low + u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high + + sig[0] = _mm256_unpacklo_epi16(u0, u2); + sig[4] = _mm256_unpackhi_epi16(u0, u2); + + sig[8] = _mm256_unpacklo_epi16(u1, u3); + sig[12] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s2, s3, 0x20); + u1 = _mm256_permute2x128_si256(s2, s3, 0x31); + + u2 = _mm256_permute2x128_si256(s3, s4, 0x20); + u3 = _mm256_permute2x128_si256(s3, s4, 0x31); + + sig[1] = _mm256_unpacklo_epi16(u0, u2); + sig[5] = _mm256_unpackhi_epi16(u0, u2); + + sig[9] = _mm256_unpacklo_epi16(u1, u3); + sig[13] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s4, s5, 0x20); + u1 = _mm256_permute2x128_si256(s4, s5, 0x31); + + u2 = _mm256_permute2x128_si256(s5, s6, 0x20); + u3 = _mm256_permute2x128_si256(s5, s6, 0x31); + + sig[2] = _mm256_unpacklo_epi16(u0, u2); + sig[6] = _mm256_unpackhi_epi16(u0, u2); + + sig[10] = _mm256_unpacklo_epi16(u1, u3); + sig[14] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s6; +} + +static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch)); + // base + 8th row + const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch)); + + __m256i u0, u1, u2, u3; + u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20); + u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31); + + u2 = _mm256_permute2x128_si256(s7, s8, 0x20); + u3 = _mm256_permute2x128_si256(s7, s8, 0x31); + + sig[3] = _mm256_unpacklo_epi16(u0, u2); + sig[7] = _mm256_unpackhi_epi16(u0, u2); + + sig[11] = _mm256_unpacklo_epi16(u1, u3); + sig[15] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s8; +} + +static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + __m256i res[4]; + int i; + for (i = 0; i < 4; ++i) { + filter_8x1_pixels(&sig[i << 2], f, &res[i]); + } + + { + const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]); + const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]); + *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20); + *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31); + } +} + +static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i p = _mm256_min_epi16(*y0, *mask); + _mm256_storeu_si256((__m256i *)dst, p); + p = _mm256_min_epi16(*y1, *mask); + _mm256_storeu_si256((__m256i *)(dst + pitch), p); +} + +static void 
update_16x9_pixels(__m256i *sig) { + update_pixels(&sig[0]); + update_pixels(&sig[8]); +} + +static void vpx_highbd_filter_block1d16_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[17], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_16x9_init(src_ptr, src_pitch, signal); + + do { + pack_16x9_pixels(src_ptr, src_pitch, signal); + filter_16x9_pixels(signal, ff, &res0, &res1); + store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_16x9_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// 2-tap vertical filtering + +static void pack_16x2_init(const uint16_t *src, __m256i *sig) { + sig[2] = _mm256_loadu_si256((const __m256i *)src); +} + +static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // load the next row + const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch)); + sig[0] = _mm256_unpacklo_epi16(sig[2], u); + sig[1] = _mm256_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_16_2t_pixels(sig, f, y0, y1); +} + +static void vpx_highbd_filter_block1d16_v2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[3], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + __m256i ff; + + pack_2t_filter(filter, &ff); + pack_16x2_init(src_ptr, signal); + + do { + pack_16x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16x2_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m128i p = _mm_set1_epi32(0x09080706); + f[0] = _mm_shuffle_epi8(h, p); +} + +static void pack_8x2_init(const uint16_t *src, __m128i *sig) { + sig[2] = _mm_loadu_si128((const __m128i *)src); +} + +static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch, + __m128i *sig) { + // load the next row + const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch)); + sig[0] = _mm_unpacklo_epi16(sig[2], u); + sig[1] = _mm_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, + __m128i *y0, __m128i *y1) { + const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m128i x0 = _mm_madd_epi16(sig[0], *f); + __m128i x1 = _mm_madd_epi16(sig[1], *f); + x0 = _mm_add_epi32(x0, rounding); + x1 = _mm_add_epi32(x1, rounding); + *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, + const __m128i *mask, uint16_t *dst) { + __m128i res = _mm_packus_epi32(*y0, *y1); + res = _mm_min_epi16(res, *mask); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void vpx_highbd_filter_block1d8_v2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t 
height, const int16_t *filter, int bd) { + __m128i signal[3], res0, res1; + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + __m128i ff; + + pack_8x1_2t_filter(filter, &ff); + pack_8x2_init(src_ptr, signal); + + do { + pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); + filter_8_2t_pixels(signal, &ff, &res0, &res1); + store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +// Calculation with averaging the input pixels + +static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask, + uint16_t *dst) { + const __m128i a0 = _mm256_castsi256_si128(*y0); + const __m128i a1 = _mm256_extractf128_si256(*y0, 1); + __m128i res = _mm_packus_epi32(a0, a1); + const __m128i pix = _mm_loadu_si128((const __m128i *)dst); + res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); + res = _mm_avg_epu16(res, pix); + _mm_storeu_si128((__m128i *)dst, res); +} + +static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst); + const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch)); + const __m256i pix = + _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); + a = _mm256_min_epi16(a, *mask); + a = _mm256_avg_epu16(a, pix); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); +} + +static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + const __m256i pix = _mm256_loadu_si256((const __m256i *)dst); + a = _mm256_min_epi16(a, *mask); + a = _mm256_avg_epu16(a, pix); + _mm256_storeu_si256((__m256i *)dst, a); +} + +static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst); + const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch)); + __m256i p = _mm256_min_epi16(*y0, *mask); + p = _mm256_avg_epu16(p, pix0); + _mm256_storeu_si256((__m256i *)dst, p); + + p = _mm256_min_epi16(*y1, *mask); + p = _mm256_avg_epu16(p, pix1); + _mm256_storeu_si256((__m256i *)(dst + pitch), p); +} + +static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0, + const __m128i *y1, + const __m128i *mask, + uint16_t *dst) { + __m128i res = _mm_packus_epi32(*y0, *y1); + const __m128i pix = _mm_loadu_si128((const __m128i *)dst); + res = _mm_min_epi16(res, *mask); + res = _mm_avg_epu16(res, pix); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void vpx_highbd_filter_block1d8_h8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_8x2_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + 
store_8x1_avg_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_16x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d8_v8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[9], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_8x9_init(src_ptr, src_pitch, signal); + + do { + pack_8x9_pixels(src_ptr, src_pitch, signal); + + filter_8x9_pixels(signal, ff, &res0, &res1); + store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d16_v8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[17], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_16x9_init(src_ptr, src_pitch, signal); + + do { + pack_16x9_pixels(src_ptr, src_pitch, signal); + filter_16x9_pixels(signal, ff, &res0, &res1); + store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_16x9_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d8_h2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_8x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_2t_pixels(src_ptr, signal); + filter_8x1_2t_pixels(signal, &ff, &res0); + store_8x1_avg_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_16x1_2t_pixels(src_ptr, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d16_v2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t 
height, const int16_t *filter, int bd) { + __m256i signal[3], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + __m256i ff; + + pack_2t_filter(filter, &ff); + pack_16x2_init(src_ptr, signal); + + do { + pack_16x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16x2_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d8_v2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m128i signal[3], res0, res1; + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + __m128i ff; + + pack_8x1_2t_filter(filter, &ff); + pack_8x2_init(src_ptr, signal); + + do { + pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); + filter_8_2t_pixels(signal, &ff, &res0, &res1); + store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +void vpx_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void vpx_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void vpx_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void vpx_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2 +#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2 +#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2 +#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2 + +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2); +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2); +HIGH_FUN_CONV_2D(, avx2); + +void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t, + uint16_t *, ptrdiff_t, uint32_t, + const int16_t *, int); +void vpx_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t, + uint16_t *, ptrdiff_t, uint32_t, + const int16_t *, int); +void vpx_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t, + uint16_t *, ptrdiff_t, uint32_t, + const int16_t *, int); +void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t, + uint16_t *, ptrdiff_t, uint32_t, + const int16_t *, int); +#define vpx_highbd_filter_block1d4_h8_avg_avx2 \ + vpx_highbd_filter_block1d4_h8_avg_sse2 +#define vpx_highbd_filter_block1d4_h2_avg_avx2 \ + vpx_highbd_filter_block1d4_h2_avg_sse2 +#define vpx_highbd_filter_block1d4_v8_avg_avx2 \ + vpx_highbd_filter_block1d4_v8_avg_sse2 +#define vpx_highbd_filter_block1d4_v2_avg_avx2 \ + vpx_highbd_filter_block1d4_v2_avg_sse2 + +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2); +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, + avx2); +HIGH_FUN_CONV_2D(avg_, avx2); + +#undef HIGHBD_FUNC diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c new file mode 100644 index 0000000000..f4f7235d13 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c @@ -0,0 +1,355 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct16_4col_stage5(const __m128i *const in, + __m128i *const out) { + // stage 5 + out[0] = _mm_add_epi32(in[0], in[3]); + out[1] = _mm_add_epi32(in[1], in[2]); + out[2] = _mm_sub_epi32(in[1], in[2]); + out[3] = _mm_sub_epi32(in[0], in[3]); + highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]); + out[8] = _mm_add_epi32(in[8], in[11]); + out[9] = _mm_add_epi32(in[9], in[10]); + out[10] = _mm_sub_epi32(in[9], in[10]); + out[11] = _mm_sub_epi32(in[8], in[11]); + out[12] = _mm_sub_epi32(in[15], in[12]); + out[13] = _mm_sub_epi32(in[14], in[13]); + out[14] = _mm_add_epi32(in[14], in[13]); + out[15] = _mm_add_epi32(in[15], in[12]); +} + +static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); + out[8] = in[8]; + out[9] = in[9]; + highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]); + highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]); + out[14] = in[14]; + out[15] = in[15]; +} + +static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + highbd_butterfly_sse2(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse2(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse2(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_butterfly_sse2(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse2(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]); + highbd_butterfly_sse2(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], 
step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp1[2], sign[2]; + + // stage 2 + highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(io[7], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse2(io[5], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_neg_sse2(io[6], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + abs_extend_64bit_sse2(io[0], temp1, sign); + step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); + step2[1] = step2[0]; + highbd_partial_butterfly_sse2(io[4], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp[2], sign[2]; + + // stage 2 + highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = + _mm_sub_epi32(_mm_setzero_si128(), step2[11]); // step1[10] = -step1[10] + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = + _mm_sub_epi32(_mm_setzero_si128(), step2[12]); // step1[13] = -step1[13] + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + abs_extend_64bit_sse2(io[0], temp, sign); + step2[0] = multiplication_round_shift_sse2(temp, sign, cospi_16_64); + step2[1] = step2[0]; + step2[2] = _mm_setzero_si128(); + step2[3] = _mm_setzero_si128(); + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[8] = step1[8]; + step2[11] = step1[11]; 
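+  // NOTE (reader aid, not part of the original change): highbd_butterfly_sse2()
+  // and highbd_butterfly_cospi16_sse2() are the usual vpx rotations on 32-bit
+  // lanes (SSE2 intrinsics come from <emmintrin.h>, the SSE4.1 ones from
+  // <smmintrin.h>). A scalar model, assuming DCT_CONST_BITS == 14:
+  //
+  //   out0 = ROUND_POWER_OF_TWO((int64_t)a * c0 - (int64_t)b * c1, 14);
+  //   out1 = ROUND_POWER_OF_TWO((int64_t)a * c1 + (int64_t)b * c0, 14);
+  //
+  // and the cospi16 flavour is the c0 == c1 == cospi_16_64 special case:
+  //
+  //   out0 = ROUND_POWER_OF_TWO((int64_t)(a + b) * cospi_16_64, 14);
+  //   out1 = ROUND_POWER_OF_TWO((int64_t)(a - b) * cospi_16_64, 14);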
+ step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16], *in; + + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + idct16_8col(in, in); + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + idct16_8col(out, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[4][16]; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + highbd_idct16_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct16_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], temp[16]; + + highbd_load_pack_transpose_32bit_8x8(input, 16, in); + for (i = 8; i < 16; i++) { + in[i] = _mm_setzero_si128(); + } + idct16_8col(in, temp); + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(temp + i, in); + idct16_8col(in, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(input, 16, in); + highbd_idct16x16_38_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + highbd_idct16x16_38_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], l[16]; + + in[0] = load_pack_8_32bit(input + 0 * 16); + in[1] = load_pack_8_32bit(input + 1 * 16); + in[2] = load_pack_8_32bit(input + 2 * 16); + in[3] = load_pack_8_32bit(input + 3 * 16); + + idct16x16_10_pass1(in, l); + + for (i = 0; i < 16; i += 8) { + int j; + idct16x16_10_pass2(l + i, in); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, in[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_4x4(input, 16, in); + highbd_idct16x16_10_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(&all[0][i], out); + highbd_idct16x16_10_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + 
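+  // NOTE (reader aid, not part of the original change): the _256/_38/_10/_1
+  // suffixes name the maximum number of non-zero input coefficients each
+  // variant handles; the caller picks one from the eob. When bd == 8 the
+  // coefficients fit in 16 bits, so those branches pack to 16-bit lanes and
+  // reuse the plain idct16_8col(); otherwise the 32-bit, 4-column kernels run.
+  // The DC-only case below boils down to one value added to every pixel;
+  // roughly, assuming DCT_CONST_BITS == 14 and the 16x16 final shift of 6:
+  //
+  //   int64_t t = ROUND_POWER_OF_TWO((int64_t)input[0] * cospi_16_64, 14);
+  //   int a1 = (int)ROUND_POWER_OF_TWO(t * cospi_16_64, 14);
+  //   a1 = ROUND_POWER_OF_TWO(a1, 6);
+  //   // every output: dest[i] = clip_pixel_highbd(dest[i] + a1, bd);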
highbd_idct_1_add_kernel(input, dest, stride, bd, 16); +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c new file mode 100644 index 0000000000..de097c66a6 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct16_4col_stage5(const __m128i *const in, + __m128i *const out) { + // stage 5 + out[0] = _mm_add_epi32(in[0], in[3]); + out[1] = _mm_add_epi32(in[1], in[2]); + out[2] = _mm_sub_epi32(in[1], in[2]); + out[3] = _mm_sub_epi32(in[0], in[3]); + highbd_butterfly_cospi16_sse4_1(in[6], in[5], &out[6], &out[5]); + out[8] = _mm_add_epi32(in[8], in[11]); + out[9] = _mm_add_epi32(in[9], in[10]); + out[10] = _mm_sub_epi32(in[9], in[10]); + out[11] = _mm_sub_epi32(in[8], in[11]); + out[12] = _mm_sub_epi32(in[15], in[12]); + out[13] = _mm_sub_epi32(in[14], in[13]); + out[14] = _mm_add_epi32(in[14], in[13]); + out[15] = _mm_add_epi32(in[15], in[12]); +} + +static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); + out[8] = in[8]; + out[9] = in[9]; + highbd_butterfly_cospi16_sse4_1(in[13], in[10], &out[13], &out[10]); + highbd_butterfly_cospi16_sse4_1(in[12], in[11], &out[12], &out[11]); + out[14] = in[14]; + out[15] = in[15]; +} + +static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + highbd_butterfly_sse4_1(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse4_1(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse4_1(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse4_1(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_butterfly_sse4_1(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse4_1(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]); + highbd_butterfly_sse4_1(io[4], io[12], cospi_24_64, 
cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp1[2]; + + // stage 2 + highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(io[7], -cospi_18_64, cospi_14_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse4_1(io[5], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_sse4_1(io[6], -cospi_20_64, cospi_12_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + extend_64bit(io[0], temp1); + step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64); + step2[1] = step2[0]; + highbd_partial_butterfly_sse4_1(io[4], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp[2]; + + // stage 2 + highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + extend_64bit(io[0], temp); + step2[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + step2[1] = step2[0]; + step2[2] = _mm_setzero_si128(); + step2[3] = _mm_setzero_si128(); + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], 
&step2[14]); + highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64, + &step2[13], &step2[10]); + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input, + uint16_t *dest, int stride, int bd) { + int i; + __m128i out[16], *in; + + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + idct16_8col(in, in); + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + idct16_8col(out, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[4][16]; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + highbd_idct16_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct16_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_38_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], temp[16]; + + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + for (i = 8; i < 16; i++) { + in[i] = _mm_setzero_si128(); + } + idct16_8col(in, temp); + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(temp + i, in); + idct16_8col(in, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(input, 16, in); + highbd_idct16x16_38_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + highbd_idct16x16_38_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], l[16]; + + in[0] = load_pack_8_32bit(input + 0 * 16); + in[1] = load_pack_8_32bit(input + 1 * 16); + in[2] = load_pack_8_32bit(input + 2 * 16); + in[3] = load_pack_8_32bit(input + 3 * 16); + + idct16x16_10_pass1(in, l); + + for (i = 0; i < 16; i += 8) { + int j; + idct16x16_10_pass2(l + i, in); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, in[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_4x4(input, 16, in); + highbd_idct16x16_10_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(&all[0][i], out); + highbd_idct16x16_10_4col(out); + + for 
(j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c new file mode 100644 index 0000000000..c710e89954 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c @@ -0,0 +1,782 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6( + __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { + __m128i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm_add_epi32(step2[8], step2[11]); + step1[9] = _mm_add_epi32(step2[9], step2[10]); + step1[10] = _mm_sub_epi32(step2[9], step2[10]); + step1[11] = _mm_sub_epi32(step2[8], step2[11]); + step1[12] = _mm_sub_epi32(step2[15], step2[12]); + step1[13] = _mm_sub_epi32(step2[14], step2[13]); + step1[14] = _mm_add_epi32(step2[14], step2[13]); + step1[15] = _mm_add_epi32(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + highbd_butterfly_sse2(step1[13], step1[10], cospi_16_64, cospi_16_64, + &out[10], &out[13]); + highbd_butterfly_sse2(step1[12], step1[11], cospi_16_64, cospi_16_64, + &out[11], &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7( + __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { + __m128i step2[32]; + + // stage 4 + step2[16] = _mm_add_epi32(step1[16], step1[19]); + step2[17] = _mm_add_epi32(step1[17], step1[18]); + step2[18] = _mm_sub_epi32(step1[17], step1[18]); + step2[19] = _mm_sub_epi32(step1[16], step1[19]); + step2[20] = _mm_sub_epi32(step1[20], step1[23]); // step2[20] = -step2[20] + step2[21] = _mm_sub_epi32(step1[21], step1[22]); // step2[21] = -step2[21] + step2[22] = _mm_add_epi32(step1[21], step1[22]); + step2[23] = _mm_add_epi32(step1[20], step1[23]); + + step2[24] = _mm_add_epi32(step1[27], step1[24]); + step2[25] = _mm_add_epi32(step1[26], step1[25]); + step2[26] = _mm_sub_epi32(step1[26], step1[25]); // step2[26] = -step2[26] + step2[27] = _mm_sub_epi32(step1[27], step1[24]); // step2[27] = -step2[27] + step2[28] = _mm_sub_epi32(step1[31], step1[28]); + step2[29] = _mm_sub_epi32(step1[30], step1[29]); + step2[30] = _mm_add_epi32(step1[29], step1[30]); + step2[31] = _mm_add_epi32(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + highbd_butterfly_sse2(step2[29], step2[18], cospi_24_64, cospi_8_64, + &step1[18], &step1[29]); + highbd_butterfly_sse2(step2[28], step2[19], cospi_24_64, cospi_8_64, + &step1[19], &step1[28]); + highbd_butterfly_sse2(step2[20], 
step2[27], cospi_8_64, cospi_24_64, + &step1[27], &step1[20]); + highbd_butterfly_sse2(step2[21], step2[26], cospi_8_64, cospi_24_64, + &step1[26], &step1[21]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[16] = _mm_add_epi32(step1[16], step1[23]); + step2[17] = _mm_add_epi32(step1[17], step1[22]); + step2[18] = _mm_add_epi32(step1[18], step1[21]); + step2[19] = _mm_add_epi32(step1[19], step1[20]); + step2[20] = _mm_sub_epi32(step1[19], step1[20]); + step2[21] = _mm_sub_epi32(step1[18], step1[21]); + step2[22] = _mm_sub_epi32(step1[17], step1[22]); + step2[23] = _mm_sub_epi32(step1[16], step1[23]); + + step2[24] = _mm_sub_epi32(step1[31], step1[24]); + step2[25] = _mm_sub_epi32(step1[30], step1[25]); + step2[26] = _mm_sub_epi32(step1[29], step1[26]); + step2[27] = _mm_sub_epi32(step1[28], step1[27]); + step2[28] = _mm_add_epi32(step1[27], step1[28]); + step2[29] = _mm_add_epi32(step1[26], step1[29]); + step2[30] = _mm_add_epi32(step1[25], step1[30]); + step2[31] = _mm_add_epi32(step1[24], step1[31]); + + // stage 7 + out[16] = step2[16]; + out[17] = step2[17]; + out[18] = step2[18]; + out[19] = step2[19]; + highbd_butterfly_sse2(step2[27], step2[20], cospi_16_64, cospi_16_64, + &out[20], &out[27]); + highbd_butterfly_sse2(step2[26], step2[21], cospi_16_64, cospi_16_64, + &out[21], &out[26]); + highbd_butterfly_sse2(step2[25], step2[22], cospi_16_64, cospi_16_64, + &out[22], &out[25]); + highbd_butterfly_sse2(step2[24], step2[23], cospi_16_64, cospi_16_64, + &out[23], &out[24]); + out[28] = step2[28]; + out[29] = step2[29]; + out[30] = step2[30]; + out[31] = step2[31]; +} + +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_butterfly_sse2(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse2(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_butterfly_sse2(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_butterfly_sse2(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block 
__m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_butterfly_sse2(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse2(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse2(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_1024_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_1024_4x32_quarter_1(in, temp); + highbd_idct32_1024_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_butterfly_sse2(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_butterfly_sse2(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], + &step1[30]); + highbd_butterfly_sse2(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_butterfly_sse2(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_butterfly_sse2(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_butterfly_sse2(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], + &step1[26]); + + highbd_butterfly_sse2(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_butterfly_sse2(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[18], step1[19]); // step2[18] = -step2[18] + step2[19] = _mm_add_epi32(step1[18], step1[19]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[22], step1[23]); // step2[22] = -step2[22] + step2[23] = _mm_add_epi32(step1[22], step1[23]); + + step2[24] = _mm_add_epi32(step1[25], step1[24]); + step2[25] = _mm_sub_epi32(step1[25], step1[24]); // step2[25] = -step2[25] + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[29], step1[28]); + step2[29] = _mm_sub_epi32(step1[29], step1[28]); // step2[29] = -step2[29] + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + 
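+  // NOTE (reader aid, not part of the original change): several intermediates
+  // in these SSE2 paths are deliberately left negated (see the
+  // "step2[x] = -step2[x]" comments). SSE2 has no signed 32x32->64 multiply
+  // (_mm_mul_epi32 is SSE4.1), so the helpers multiply magnitudes with
+  // _mm_mul_epu32 and the callers fold the sign back in later, e.g. by
+  // swapping which output of the following butterfly receives the add and
+  // which the subtract. The SSE4.1 twins of these files skip that bookkeeping.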
step1[31] = step2[31]; + highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64, + &step1[29], &step1[18]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64, + &step1[25], &step1[22]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_1024_4x32_quarter_1_2(io, temp); + highbd_idct32_1024_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_1024_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[4][32], io[32]; + + // rows + for (i = 0; i < 4; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]); + highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]); + highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]); + idct32_1024_8x32(io, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + // Transpose 32x8 block to 8x32 block + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + transpose_16bit_8x8(col[2] + i, io + 16); + transpose_16bit_8x8(col[3] + i, io + 24); + idct32_1024_8x32(io, io); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, io[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 8; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]); + highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]); + highbd_idct32_1024_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + transpose_32bit_4x4(all[4] + i, out + 16); + transpose_32bit_4x4(all[5] + i, out + 20); + transpose_32bit_4x4(all[6] + i, out + 24); + transpose_32bit_4x4(all[7] + i, out + 28); + highbd_idct32_1024_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_neg_sse2(in[12], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_partial_butterfly_sse2(in[8], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], 
step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(in[14], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse2(in[10], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_135_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_135_4x32_quarter_1(in, temp); + highbd_idct32_135_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_neg_sse2(in[15], cospi_15_64, cospi_17_64, + &step1[17], &step1[30]); + highbd_partial_butterfly_sse2(in[9], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_neg_sse2(in[11], cospi_11_64, cospi_21_64, + &step1[21], &step1[26]); + + highbd_partial_butterfly_sse2(in[13], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = 
_mm_sub_epi32(step1[18], step1[19]); // step2[18] = -step2[18] + step2[19] = _mm_add_epi32(step1[18], step1[19]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[22], step1[23]); // step2[22] = -step2[22] + step2[23] = _mm_add_epi32(step1[22], step1[23]); + + step2[24] = _mm_add_epi32(step1[25], step1[24]); + step2[25] = _mm_sub_epi32(step1[25], step1[24]); // step2[25] = -step2[25] + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[29], step1[28]); + step2[29] = _mm_sub_epi32(step1[29], step1[28]); // step2[29] = -step2[29] + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64, + &step1[29], &step1[18]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64, + &step1[25], &step1[22]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_135_4x32_quarter_1_2(io, temp); + highbd_idct32_135_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_135_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[2][32], in[32], out[32]; + + for (i = 16; i < 32; i++) { + in[i] = _mm_setzero_si128(); + } + + // rows + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]); + idct32_1024_8x32(in, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, in); + transpose_16bit_8x8(col[1] + i, in + 8); + idct32_1024_8x32(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_135_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_135_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + + // stage 4 + 
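+  // NOTE (reader aid, not part of the original change): a "partial" butterfly
+  // is the rotation with one input known to be zero, so it collapses to two
+  // scalings of the same lane; roughly, with DCT_CONST_BITS == 14:
+  //
+  //   out0 = ROUND_POWER_OF_TWO((int64_t)a * c0, 14);
+  //   out1 = ROUND_POWER_OF_TWO((int64_t)a * c1, 14);
+  //
+  // The SSE2 "_neg" flavour covers the case where the surviving term enters
+  // the first output with a minus sign, i.e. out0 uses -a * c1 and out1 uses
+  // a * c0.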
highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/, + __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + step1[10] = + _mm_sub_epi32(_mm_setzero_si128(), step1[10]); // step1[10] = -step1[10] + step1[13] = + _mm_sub_epi32(_mm_setzero_si128(), step1[13]); // step1[13] = -step1[13] + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_34_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_34_4x32_quarter_1(in, temp); + highbd_idct32_34_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[18] = step1[19]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[21] = step1[20]; + step2[22] = step1[23]; + step2[23] = step1[23]; + + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[26] = step1[27]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[29] = step1[28]; + step2[30] = step1[31]; + step2[31] = step1[31]; + + // stage 3 + step2[18] = + _mm_sub_epi32(_mm_setzero_si128(), step2[18]); // step2[18] = -step2[18] + step2[22] = + _mm_sub_epi32(_mm_setzero_si128(), step2[22]); // step2[22] = -step2[22] + step2[25] = + _mm_sub_epi32(_mm_setzero_si128(), step2[25]); // step2[25] = -step2[25] + step2[29] = + _mm_sub_epi32(_mm_setzero_si128(), step2[29]); // step2[29] = -step2[29] + step1[16] = step2[16]; + step1[31] = step2[31]; + 
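+  // NOTE (reader aid, not part of the original change): with eob <= 34 only
+  // in[1], in[3], in[5] and in[7] feed this quarter, so each stage-1 partner
+  // is zero and stage 2's add/sub pairs degenerate into copies; for example
+  //
+  //   step2[16] = step1[16] + step1[17];  step2[17] = step1[16] - step1[17];
+  //
+  // reduces to step2[16] = step2[17] = step1[16] because step1[17] == 0 here.
+  // The explicit _mm_setzero_si128() subtractions above rebuild the negated
+  // values that the SSE2 butterflies expect.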
highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64, + &step1[29], &step1[18]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64, + &step1[25], &step1[22]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_34_4x32_quarter_1_2(io, temp); + highbd_idct32_34_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_34_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[32], in[32], out[32]; + + // rows + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + idct32_34_8x32_sse2(in, col); + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col + i, in); + idct32_34_8x32_sse2(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_34_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_34_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + highbd_idct_1_add_kernel(input, dest, stride, bd, 32); +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c b/libs/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c new file mode 100644 index 0000000000..2d0a53ac0a --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c @@ -0,0 +1,765 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_ssse3.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6( + __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { + __m128i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[13], step1[10], -cospi_8_64, cospi_24_64, + &step2[10], &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm_add_epi32(step2[8], step2[11]); + step1[9] = _mm_add_epi32(step2[9], step2[10]); + step1[10] = _mm_sub_epi32(step2[9], step2[10]); + step1[11] = _mm_sub_epi32(step2[8], step2[11]); + step1[12] = _mm_sub_epi32(step2[15], step2[12]); + step1[13] = _mm_sub_epi32(step2[14], step2[13]); + step1[14] = _mm_add_epi32(step2[14], step2[13]); + step1[15] = _mm_add_epi32(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + highbd_butterfly_sse4_1(step1[13], step1[10], cospi_16_64, cospi_16_64, + &out[10], &out[13]); + highbd_butterfly_sse4_1(step1[12], step1[11], cospi_16_64, cospi_16_64, + &out[11], &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7( + __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { + __m128i step2[32]; + + // stage 4 + step2[16] = _mm_add_epi32(step1[16], step1[19]); + step2[17] = _mm_add_epi32(step1[17], step1[18]); + step2[18] = _mm_sub_epi32(step1[17], step1[18]); + step2[19] = _mm_sub_epi32(step1[16], step1[19]); + step2[20] = _mm_sub_epi32(step1[23], step1[20]); + step2[21] = _mm_sub_epi32(step1[22], step1[21]); + step2[22] = _mm_add_epi32(step1[22], step1[21]); + step2[23] = _mm_add_epi32(step1[23], step1[20]); + + step2[24] = _mm_add_epi32(step1[24], step1[27]); + step2[25] = _mm_add_epi32(step1[25], step1[26]); + step2[26] = _mm_sub_epi32(step1[25], step1[26]); + step2[27] = _mm_sub_epi32(step1[24], step1[27]); + step2[28] = _mm_sub_epi32(step1[31], step1[28]); + step2[29] = _mm_sub_epi32(step1[30], step1[29]); + step2[30] = _mm_add_epi32(step1[29], step1[30]); + step2[31] = _mm_add_epi32(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + highbd_butterfly_sse4_1(step2[29], step2[18], cospi_24_64, cospi_8_64, + &step1[18], &step1[29]); + highbd_butterfly_sse4_1(step2[28], step2[19], cospi_24_64, cospi_8_64, + &step1[19], &step1[28]); + highbd_butterfly_sse4_1(step2[27], step2[20], -cospi_8_64, cospi_24_64, + &step1[20], &step1[27]); + highbd_butterfly_sse4_1(step2[26], step2[21], -cospi_8_64, cospi_24_64, + &step1[21], &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[16] = _mm_add_epi32(step1[16], step1[23]); + step2[17] = _mm_add_epi32(step1[17], step1[22]); + step2[18] = _mm_add_epi32(step1[18], step1[21]); + step2[19] = _mm_add_epi32(step1[19], step1[20]); + step2[20] = _mm_sub_epi32(step1[19], step1[20]); + step2[21] = _mm_sub_epi32(step1[18], step1[21]); + step2[22] = _mm_sub_epi32(step1[17], step1[22]); + step2[23] = _mm_sub_epi32(step1[16], step1[23]); + + step2[24] = 
_mm_sub_epi32(step1[31], step1[24]); + step2[25] = _mm_sub_epi32(step1[30], step1[25]); + step2[26] = _mm_sub_epi32(step1[29], step1[26]); + step2[27] = _mm_sub_epi32(step1[28], step1[27]); + step2[28] = _mm_add_epi32(step1[27], step1[28]); + step2[29] = _mm_add_epi32(step1[26], step1[29]); + step2[30] = _mm_add_epi32(step1[25], step1[30]); + step2[31] = _mm_add_epi32(step1[24], step1[31]); + + // stage 7 + out[16] = step2[16]; + out[17] = step2[17]; + out[18] = step2[18]; + out[19] = step2[19]; + highbd_butterfly_sse4_1(step2[27], step2[20], cospi_16_64, cospi_16_64, + &out[20], &out[27]); + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_16_64, cospi_16_64, + &out[21], &out[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], cospi_16_64, cospi_16_64, + &out[22], &out[25]); + highbd_butterfly_sse4_1(step2[24], step2[23], cospi_16_64, cospi_16_64, + &out[23], &out[24]); + out[28] = step2[28]; + out[29] = step2[29]; + out[30] = step2[30]; + out[31] = step2[31]; +} + +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_butterfly_sse4_1(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse4_1(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_butterfly_sse4_1(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_butterfly_sse4_1(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64, + &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_butterfly_sse4_1(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse4_1(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse4_1(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse4_1(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = 
_mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_1024_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_1024_4x32_quarter_1(in, temp); + highbd_idct32_1024_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_butterfly_sse4_1(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_butterfly_sse4_1(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], + &step1[30]); + highbd_butterfly_sse4_1(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_butterfly_sse4_1(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_butterfly_sse4_1(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_butterfly_sse4_1(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], + &step1[26]); + + highbd_butterfly_sse4_1(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_butterfly_sse4_1(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[19], step1[18]); + step2[19] = _mm_add_epi32(step1[19], step1[18]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[23], step1[22]); + step2[23] = _mm_add_epi32(step1[23], step1[22]); + + step2[24] = _mm_add_epi32(step1[24], step1[25]); + step2[25] = _mm_sub_epi32(step1[24], step1[25]); + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[28], step1[29]); + step2[29] = _mm_sub_epi32(step1[28], step1[29]); + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64, + &step1[18], &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64, + &step1[22], &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_1024_4x32_quarter_1_2(io, temp); + highbd_idct32_1024_4x32_quarter_3_4(io, temp); + // final stage + 
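+  // NOTE (reader aid, not part of the original change): highbd_add_sub_butterfly()
+  // is the mirrored recombination applied once each half is ready; a rough
+  // sketch of its expected behaviour:
+  //
+  //   for (i = 0; i < size / 2; ++i) {
+  //     out[i]            = _mm_add_epi32(in[i], in[size - 1 - i]);
+  //     out[size - 1 - i] = _mm_sub_epi32(in[i], in[size - 1 - i]);
+  //   }
+  //
+  // size == 16 merges quarters 1 and 2 (stage 7); size == 32 merges the lower
+  // and upper halves into the final 32 outputs.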
highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_1024_add_sse4_1(const tran_low_t *input, + uint16_t *dest, int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[4][32], io[32]; + + // rows + for (i = 0; i < 4; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]); + highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]); + highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]); + idct32_1024_8x32(io, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + // Transpose 32x8 block to 8x32 block + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + transpose_16bit_8x8(col[2] + i, io + 16); + transpose_16bit_8x8(col[3] + i, io + 24); + idct32_1024_8x32(io, io); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, io[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 8; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]); + highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]); + highbd_idct32_1024_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + transpose_32bit_4x4(all[4] + i, out + 16); + transpose_32bit_4x4(all[5] + i, out + 20); + transpose_32bit_4x4(all[6] + i, out + 24); + transpose_32bit_4x4(all[7] + i, out + 28); + highbd_idct32_1024_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_sse4_1(in[12], -cospi_20_64, cospi_12_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_partial_butterfly_sse4_1(in[8], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64, + &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], 
step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(in[14], -cospi_18_64, cospi_14_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse4_1(in[10], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_135_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_135_4x32_quarter_1(in, temp); + highbd_idct32_135_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_sse4_1(in[15], -cospi_17_64, cospi_15_64, &step1[17], + &step1[30]); + highbd_partial_butterfly_sse4_1(in[9], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_sse4_1(in[11], -cospi_21_64, cospi_11_64, &step1[21], + &step1[26]); + + highbd_partial_butterfly_sse4_1(in[13], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[19], step1[18]); + step2[19] = _mm_add_epi32(step1[19], step1[18]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[23], step1[22]); + step2[23] = _mm_add_epi32(step1[23], step1[22]); + + step2[24] = _mm_add_epi32(step1[24], step1[25]); + step2[25] = _mm_sub_epi32(step1[24], step1[25]); + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[28], step1[29]); + step2[29] = _mm_sub_epi32(step1[28], step1[29]); + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + 
highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64, + &step1[18], &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64, + &step1[22], &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_135_4x32_quarter_1_2(io, temp); + highbd_idct32_135_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_135_add_sse4_1(const tran_low_t *input, + uint16_t *dest, int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[2][32], in[32], out[32]; + + // rows + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]); + idct32_135_8x32_ssse3(in, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, in); + transpose_16bit_8x8(col[1] + i, in + 8); + idct32_135_8x32_ssse3(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_135_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_135_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + + // stage 4 + highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64, + &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/, + __m128i 
*out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_34_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_34_4x32_quarter_1(in, temp); + highbd_idct32_34_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[18] = step1[19]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[21] = step1[20]; + step2[22] = step1[23]; + step2[23] = step1[23]; + + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[26] = step1[27]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[29] = step1[28]; + step2[30] = step1[31]; + step2[31] = step1[31]; + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64, + &step1[18], &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64, + &step1[22], &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_34_4x32_quarter_1_2(io, temp); + highbd_idct32_34_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_34_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[32], in[32], out[32]; + + // rows + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + idct32_34_8x32_ssse3(in, col); + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col + i, in); + idct32_34_8x32_ssse3(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 
32, &in[8]); + highbd_idct32_34_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_34_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c new file mode 100644 index 0000000000..2e54d24736 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <emmintrin.h> // SSE2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +static INLINE __m128i dct_const_round_shift_4_sse2(const __m128i in0, + const __m128i in1) { + const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 1 + const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 2, 3 + const __m128i t2 = _mm_unpacklo_epi64(t0, t1); // 0, 1, 2, 3 + return dct_const_round_shift_sse2(t2); +} + +static INLINE void highbd_idct4_small_sse2(__m128i *const io) { + const __m128i cospi_p16_p16 = _mm_setr_epi32(cospi_16_64, 0, cospi_16_64, 0); + const __m128i cospi_p08_p08 = _mm_setr_epi32(cospi_8_64, 0, cospi_8_64, 0); + const __m128i cospi_p24_p24 = _mm_setr_epi32(cospi_24_64, 0, cospi_24_64, 0); + __m128i temp1[4], temp2[4], step[4]; + + transpose_32bit_4x4(io, io); + + // Note: There is no 32-bit signed multiply SIMD instruction in SSE2. + // _mm_mul_epu32() is used which can only guarantee the lower 32-bit + // (signed) result is meaningful, which is enough in this function.
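+ // (Editorial illustration, not part of the upstream change.) The unsigned
+ // multiply is safe because, in two's complement, the low 32 bits of the
+ // unsigned product equal the low 32 bits of the signed product. For
+ // example, -3 * cospi_16_64 = -3 * 11585 = -34755, while _mm_mul_epu32()
+ // computes 0xFFFFFFFD * 11585, whose low 32 bits are 0xFFFF783D, i.e.
+ // -34755 when reinterpreted as signed. dct_const_round_shift_4_sse2()
+ // above keeps exactly those low 32 bits, so the result is correct as long
+ // as the true signed product fits in 32 bits; the max/min range check in
+ // vpx_highbd_idct4x4_16_add_sse2() below ensures that before this path is
+ // taken.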
+ + // stage 1 + temp1[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] + temp2[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] + temp1[1] = _mm_srli_si128(temp1[0], 4); // 1, 3 + temp2[1] = _mm_srli_si128(temp2[0], 4); // 1, 3 + temp1[0] = _mm_mul_epu32(temp1[0], cospi_p16_p16); // ([0] + [2])*cospi_16_64 + temp1[1] = _mm_mul_epu32(temp1[1], cospi_p16_p16); // ([0] + [2])*cospi_16_64 + temp2[0] = _mm_mul_epu32(temp2[0], cospi_p16_p16); // ([0] - [2])*cospi_16_64 + temp2[1] = _mm_mul_epu32(temp2[1], cospi_p16_p16); // ([0] - [2])*cospi_16_64 + step[0] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]); + step[1] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]); + + temp1[3] = _mm_srli_si128(io[1], 4); + temp2[3] = _mm_srli_si128(io[3], 4); + temp1[0] = _mm_mul_epu32(io[1], cospi_p24_p24); // input[1] * cospi_24_64 + temp1[1] = _mm_mul_epu32(temp1[3], cospi_p24_p24); // input[1] * cospi_24_64 + temp2[0] = _mm_mul_epu32(io[1], cospi_p08_p08); // input[1] * cospi_8_64 + temp2[1] = _mm_mul_epu32(temp1[3], cospi_p08_p08); // input[1] * cospi_8_64 + temp1[2] = _mm_mul_epu32(io[3], cospi_p08_p08); // input[3] * cospi_8_64 + temp1[3] = _mm_mul_epu32(temp2[3], cospi_p08_p08); // input[3] * cospi_8_64 + temp2[2] = _mm_mul_epu32(io[3], cospi_p24_p24); // input[3] * cospi_24_64 + temp2[3] = _mm_mul_epu32(temp2[3], cospi_p24_p24); // input[3] * cospi_24_64 + temp1[0] = _mm_sub_epi64(temp1[0], temp1[2]); // [1]*cospi_24 - [3]*cospi_8 + temp1[1] = _mm_sub_epi64(temp1[1], temp1[3]); // [1]*cospi_24 - [3]*cospi_8 + temp2[0] = _mm_add_epi64(temp2[0], temp2[2]); // [1]*cospi_8 + [3]*cospi_24 + temp2[1] = _mm_add_epi64(temp2[1], temp2[3]); // [1]*cospi_8 + [3]*cospi_24 + step[2] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]); + step[3] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} + +static INLINE void highbd_idct4_large_sse2(__m128i *const io) { + __m128i step[4]; + + transpose_32bit_4x4(io, io); + + // stage 1 + highbd_butterfly_cospi16_sse2(io[0], io[2], &step[0], &step[1]); + highbd_butterfly_sse2(io[1], io[3], cospi_24_64, cospi_8_64, &step[2], + &step[3]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} + +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int16_t max = 0, min = 0; + __m128i io[4], io_short[2]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 8)); + io[3] = _mm_load_si128((const __m128i *)(input + 12)); + + io_short[0] = _mm_packs_epi32(io[0], io[1]); + io_short[1] = _mm_packs_epi32(io[2], io[3]); + + if (bd != 8) { + __m128i max_input, min_input; + + max_input = _mm_max_epi16(io_short[0], io_short[1]); + min_input = _mm_min_epi16(io_short[0], io_short[1]); + max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 8)); + min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 8)); + max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 4)); + min_input = _mm_min_epi16(min_input, 
_mm_srli_si128(min_input, 4)); + max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 2)); + min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 2)); + max = _mm_extract_epi16(max_input, 0); + min = _mm_extract_epi16(min_input, 0); + } + + if (bd == 8 || (max < 4096 && min >= -4096)) { + idct4_sse2(io_short); + idct4_sse2(io_short); + io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8)); + io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8)); + io[0] = _mm_srai_epi16(io_short[0], 4); + io[1] = _mm_srai_epi16(io_short[1], 4); + } else { + if (max < 32767 && min > -32768) { + highbd_idct4_small_sse2(io); + highbd_idct4_small_sse2(io); + } else { + highbd_idct4_large_sse2(io); + highbd_idct4_large_sse2(io); + } + io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); + io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); + } + + recon_and_store_4x4(io, dest, stride, bd); +} + +void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int a1, i; + tran_low_t out; + __m128i dc, d; + + out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, 4); + dc = _mm_set1_epi16(a1); + + for (i = 0; i < 4; ++i) { + d = _mm_loadl_epi64((const __m128i *)dest); + d = add_clamp(d, dc, bd); + _mm_storel_epi64((__m128i *)dest, d); + dest += stride; + } +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c b/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c new file mode 100644 index 0000000000..38e64f3bc9 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <smmintrin.h> // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +static INLINE void highbd_idct4(__m128i *const io) { + __m128i temp[2], step[4]; + + transpose_32bit_4x4(io, io); + + // stage 1 + temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] + extend_64bit(temp[0], temp); + step[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] + extend_64bit(temp[0], temp); + step[1] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2], + &step[3]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} + +void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + __m128i io[4]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 8)); + io[3] = _mm_load_si128((const __m128i *)(input + 12)); + + if (bd == 8) { + __m128i io_short[2]; + + io_short[0] = _mm_packs_epi32(io[0], io[1]); + io_short[1] = _mm_packs_epi32(io[2], io[3]); + idct4_sse2(io_short); + idct4_sse2(io_short); + io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8)); + io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8)); + io[0] = _mm_srai_epi16(io_short[0], 4); + io[1] = _mm_srai_epi16(io_short[1], 4); + } else { + highbd_idct4(io); + highbd_idct4(io); + io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); + io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); + } + + recon_and_store_4x4(io, dest, stride, bd); +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c new file mode 100644 index 0000000000..909a6b7948 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include <emmintrin.h> // SSE2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +static void highbd_idct8x8_half1d(__m128i *const io) { + __m128i step1[8], step2[8]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + step1[0] = io[0]; + step1[2] = io[4]; + step1[1] = io[2]; + step1[3] = io[6]; + highbd_butterfly_sse2(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse2(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 2 + highbd_butterfly_cospi16_sse2(step1[0], step1[2], &step2[0], &step2[1]); + highbd_butterfly_sse2(step1[1], step1[3], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +static void highbd_idct8x8_12_half1d(__m128i *const io) { + __m128i temp1[4], sign[2], step1[8], step2[8]; + + transpose_32bit_4x4(io, io); + + // stage 1 + step1[0] = io[0]; + step1[1] = io[2]; + abs_extend_64bit_sse2(io[1], temp1, sign); + step1[4] = multiplication_round_shift_sse2(temp1, sign, cospi_28_64); + step1[7] = multiplication_round_shift_sse2(temp1, sign, cospi_4_64); + abs_extend_64bit_sse2(io[3], temp1, sign); + step1[5] = multiplication_neg_round_shift_sse2(temp1, sign, cospi_20_64); + step1[6] = multiplication_round_shift_sse2(temp1, sign, cospi_12_64); + + // stage 2 + abs_extend_64bit_sse2(step1[0], temp1, sign); + step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); + abs_extend_64bit_sse2(step1[1], temp1, sign); + step2[2] = multiplication_round_shift_sse2(temp1, sign, cospi_24_64); + step2[3] = multiplication_round_shift_sse2(temp1, sign, cospi_8_64); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[0], step2[2]); + step1[2] = _mm_sub_epi32(step2[0], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] =
_mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + idct8_sse2(io_short); + idct8_sse2(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + highbd_idct8x8_half1d(io); + + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + highbd_idct8x8_half1d(&io[8]); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + highbd_idct8x8_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_half1d(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} + +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], zero); + io_short[1] = _mm_packs_epi32(io[1], zero); + io_short[2] = _mm_packs_epi32(io[2], zero); + io_short[3] = _mm_packs_epi32(io[3], zero); + + idct8x8_12_add_kernel_sse2(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + highbd_idct8x8_12_half1d(io); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + highbd_idct8x8_12_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_12_half1d(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} + +void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + highbd_idct_1_add_kernel(input, dest, stride, bd, 8); +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c new file mode 100644 index 0000000000..ae391b2c02 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <smmintrin.h> // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_ssse3.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +static void highbd_idct8x8_half1d(__m128i *const io) { + __m128i step1[8], step2[8]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + step1[0] = io[0]; + step1[2] = io[4]; + step1[1] = io[2]; + step1[3] = io[6]; + highbd_butterfly_sse4_1(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse4_1(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 2 + highbd_butterfly_cospi16_sse4_1(step1[0], step1[2], &step2[0], &step2[1]); + highbd_butterfly_sse4_1(step1[1], step1[3], cospi_24_64, cospi_8_64, + &step2[2], &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +static void highbd_idct8x8_12_half1d(__m128i *const io) { + __m128i temp1[2], step1[8], step2[8]; + + transpose_32bit_4x4(io, io); + + // stage 1 + step1[0] = io[0]; + step1[1] = io[2]; + extend_64bit(io[1], temp1); + step1[4] = multiplication_round_shift_sse4_1(temp1, cospi_28_64); + step1[7] = multiplication_round_shift_sse4_1(temp1, cospi_4_64); + extend_64bit(io[3], temp1); + step1[5] = multiplication_round_shift_sse4_1(temp1, -cospi_20_64); + step1[6] = multiplication_round_shift_sse4_1(temp1, cospi_12_64); + + // stage 2 + extend_64bit(step1[0], temp1); + step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64); + extend_64bit(step1[1], temp1); + step2[2] = multiplication_round_shift_sse4_1(temp1, cospi_24_64); + step2[3] = multiplication_round_shift_sse4_1(temp1, cospi_8_64); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[0], step2[2]); + step1[2] = _mm_sub_epi32(step2[0], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] =
_mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] = _mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + idct8_sse2(io_short); + idct8_sse2(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + highbd_idct8x8_half1d(io); + + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + highbd_idct8x8_half1d(&io[8]); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + highbd_idct8x8_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_half1d(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} + +void vpx_highbd_idct8x8_12_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], zero); + io_short[1] = _mm_packs_epi32(io[1], zero); + io_short[2] = _mm_packs_epi32(io[2], zero); + io_short[3] = _mm_packs_epi32(io[3], zero); + + idct8x8_12_add_kernel_ssse3(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + highbd_idct8x8_12_half1d(io); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + highbd_idct8x8_12_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_12_half1d(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c new file mode 100644 index 0000000000..2051381aa8 
--- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c @@ -0,0 +1,533 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <emmintrin.h> // SSE2 + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +// ----------------------------------------------------------------------------- + +void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7)); +} + +static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)*dst, val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + const __m128i left_u16 = _mm_load_si128((const __m128i
*)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_16_unpacklo(&dst, stride, &row0); + h_store_16_unpacklo(&dst, stride, &row1); + h_store_16_unpacklo(&dst, stride, &row2); + h_store_16_unpacklo(&dst, stride, &row3); + h_store_16_unpackhi(&dst, stride, &row4); + h_store_16_unpackhi(&dst, stride, &row5); + h_store_16_unpackhi(&dst, stride, &row6); + h_store_16_unpackhi(&dst, stride, &row7); + } +} + +static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_32_unpacklo(&dst, stride, &row0); + h_store_32_unpacklo(&dst, stride, &row1); + h_store_32_unpacklo(&dst, stride, &row2); + h_store_32_unpacklo(&dst, stride, &row3); + h_store_32_unpackhi(&dst, stride, &row4); + h_store_32_unpackhi(&dst, stride, &row5); + h_store_32_unpackhi(&dst, stride, &row6); + h_store_32_unpackhi(&dst, stride, &row7); + } +} + +//------------------------------------------------------------------------------ +// DC 4x4 + +static INLINE __m128i dc_sum_4(const uint16_t *ref) { + const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const 
__m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x4(dst, stride, &dc_dup); +} + +//------------------------------------------------------------------------------ +// DC 8x8 + +static INLINE __m128i dc_sum_8(const uint16_t *ref) { + const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref); + const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8)); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_8x8(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)left; + (void)bd; + dc_store_8x8(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_8x8(dst, stride, &dc_dup); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +static INLINE __m128i dc_sum_16(const uint16_t *ref) { + const __m128i sum_lo = dc_sum_8(ref); + const __m128i sum_hi = dc_sum_8(ref + 8); + return _mm_add_epi16(sum_lo, sum_hi); +} + +static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < 16; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + 
dc_store_16x16(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16x16(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16x16(dst, stride, &dc_dup); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +static INLINE __m128i dc_sum_32(const uint16_t *ref) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sum_a = dc_sum_16(ref); + const __m128i sum_b = dc_sum_16(ref + 16); + // 12 bit bd will outrange, so expand to 32 bit before adding final total + return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero), + _mm_unpacklo_epi16(sum_b, zero)); +} + +static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < 32; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + _mm_store_si128((__m128i *)(dst + 16), dc_dup); + _mm_store_si128((__m128i *)(dst + 24), dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + dc_store_32x32(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + (void)bd; + dc_store_32x32(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32x32(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +void vpx_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const __m128i 
XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); + const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); + const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); + const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); + const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); + const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4); + const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0); + const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00); + const __m128i row0 = _mm_srli_si128(avg2, 6); + const __m128i row1 = _mm_srli_si128(avg3, 4); + const __m128i row2 = _mm_srli_si128(avg2, 4); + const __m128i row3 = _mm_srli_si128(avg3, 2); + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + + dst -= stride; + dst[0] = _mm_extract_epi16(avg3, 1); + dst[stride] = _mm_extract_epi16(avg3, 0); +} + +void vpx_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); + const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); + const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); + const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); + const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); + const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0); + const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC); + const __m128i row0 = _mm_srli_si128(avg3, 6); + const __m128i row1 = _mm_srli_si128(avg3, 4); + const __m128i row2 = _mm_srli_si128(avg3, 2); + const __m128i row3 = avg3; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5)); + const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0); + const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1); + const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2); + const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3); + const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2); + const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4); + const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00); + const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0); + const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3); + const __m128i row2 = _mm_srli_si128(row3, 4); + const __m128i row1 = _mm_srli_si128(row3, 8); + const __m128i row0 = _mm_srli_si128(avg3, 4); + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst[0] = _mm_extract_epi16(avg2, 3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i IJKL0000 = 
_mm_load_si128((const __m128i *)left); + const __m128i LLLL0000 = _mm_shufflelo_epi16(IJKL0000, 0xff); + const __m128i IJKLLLLL = _mm_unpacklo_epi64(IJKL0000, LLLL0000); + const __m128i JKLLLLL0 = _mm_srli_si128(IJKLLLLL, 2); + const __m128i KLLLLL00 = _mm_srli_si128(IJKLLLLL, 4); + const __m128i avg3 = avg3_epu16(&IJKLLLLL, &JKLLLLL0, &KLLLLL00); + const __m128i avg2 = _mm_avg_epu16(IJKLLLLL, JKLLLLL0); + const __m128i row0 = _mm_unpacklo_epi16(avg2, avg3); + const __m128i row1 = _mm_srli_si128(row0, 4); + const __m128i row2 = _mm_srli_si128(row0, 8); + const __m128i row3 = LLLL0000; + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_d63_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGH0); + const __m128i row0 = avg2; + const __m128i row1 = avg3; + const __m128i row2 = _mm_srli_si128(avg2, 2); + const __m128i row3 = _mm_srli_si128(avg3, 2); + (void)left; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c new file mode 100644 index 0000000000..b9dcef205b --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c @@ -0,0 +1,930 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <tmmintrin.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00); + (void)left; + (void)bd; + _mm_storel_epi64((__m128i *)dst, avg3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6)); + dst[3] = above[7]; // aka H +} + +static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride, + __m128i *row, const __m128i *ar) { + *row = _mm_alignr_epi8(*ar, *row, 2); + _mm_store_si128((__m128i *)*dst, *row); + *dst += stride; +} + +void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4); + __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + (void)left; + (void)bd; + _mm_store_si128((__m128i *)dst, avg3); + dst += stride; + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); +} + +static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride, + __m128i *row_0, __m128i *row_1, + const __m128i *ar) { + *row_0 = _mm_alignr_epi8(*row_1, *row_0, 2); + *row_1 = _mm_alignr_epi8(*ar, *row_1, 2); + _mm_store_si128((__m128i *)*dst, *row_0); + _mm_store_si128((__m128i *)(*dst + 8), *row_1); + *dst += stride; +} + +void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(AR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(AR, A1, 4); + __m128i avg3_0 =
avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + (void)left; + (void)bd; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + dst += stride; + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); +} + +void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(AR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(AR, A3, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + int i; + (void)left; + (void)bd; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); + dst += stride; + for (i = 1; i < 32; ++i) { + avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2); + avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2); + avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2); + avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2); + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); + dst += stride; + } +} + +DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = { + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1 +}; + +static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) { + *a = _mm_shuffle_epi8(*a, *rotrw); + return *a; +} + +void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i IXABCDEF = + _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14); + const __m128i avg3 = 
avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2); + __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0); + __m128i rowa = avg2; + __m128i rowb = avg3; + int i; + (void)bd; + for (i = 0; i < 8; i += 2) { + _mm_store_si128((__m128i *)dst, rowa); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb); + dst += stride; + rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); + rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14); + } +} + +void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); + const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); + const __m128i L1_ = _mm_srli_si128(L1, 2); + __m128i rowa_0 = avg2_0; + __m128i rowa_1 = avg2_1; + __m128i rowb_0 = avg3_0; + __m128i rowb_1 = avg3_1; + __m128i avg3_left[2]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); + avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); + for (i = 0; i < 2; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; j += 2) { + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb_0); + _mm_store_si128((__m128i *)(dst + 8), rowb_1); + dst += stride; + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); + rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); + } + } +} + +void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i avg2_2 = _mm_avg_epu16(A2, B2); + const __m128i avg2_3 = 
_mm_avg_epu16(A3, B3); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); + const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); + const __m128i C2 = _mm_alignr_epi8(B2, B1, 14); + const __m128i C3 = _mm_alignr_epi8(B3, B2, 14); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); + const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2); + const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2); + const __m128i L3_ = _mm_srli_si128(L3, 2); + __m128i rowa_0 = avg2_0; + __m128i rowa_1 = avg2_1; + __m128i rowa_2 = avg2_2; + __m128i rowa_3 = avg2_3; + __m128i rowb_0 = avg3_0; + __m128i rowb_1 = avg3_1; + __m128i rowb_2 = avg3_2; + __m128i rowb_3 = avg3_3; + __m128i avg3_left[4]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); + avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); + avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_); + avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_); + for (i = 0; i < 4; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; j += 2) { + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + _mm_store_si128((__m128i *)(dst + 16), rowa_2); + _mm_store_si128((__m128i *)(dst + 24), rowa_3); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb_0); + _mm_store_si128((__m128i *)(dst + 8), rowb_1); + _mm_store_si128((__m128i *)(dst + 16), rowb_2); + _mm_store_si128((__m128i *)(dst + 24), rowb_3); + dst += stride; + rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); + rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14); + rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14); + rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); + rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); + } + } +} + +void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i AXIJKLMN = + _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14); + const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0); + __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); + __m128i rowa = avg3; + int i; + (void)bd; + for (i = 0; i < 8; ++i) { + rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa); + dst += stride; + } +} + +void 
vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i B0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); + const __m128i C1 = _mm_srli_si128(B1, 2); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); + const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); + __m128i rowa_0 = avg3_0; + __m128i rowa_1 = avg3_1; + __m128i avg3_left[2]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); + avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); + for (i = 0; i < 2; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; ++j) { + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + dst += stride; + } + } +} + +void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i B0 = _mm_load_si128((const __m128i *)above); + const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); + const __m128i C1 = _mm_alignr_epi8(B2, B1, 2); + const __m128i C2 = _mm_alignr_epi8(B3, B2, 2); + const __m128i C3 = _mm_srli_si128(B3, 2); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); + const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); + const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14); + const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14); + __m128i rowa_0 = avg3_0; + __m128i rowa_1 = avg3_1; + __m128i rowa_2 = avg3_2; + __m128i rowa_3 = avg3_3; + __m128i 
avg3_left[4]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); + avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); + avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_); + avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_); + for (i = 0; i < 4; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; ++j) { + rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); + rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + _mm_store_si128((__m128i *)(dst + 16), rowa_2); + _mm_store_si128((__m128i *)(dst + 24), rowa_3); + dst += stride; + } + } +} + +void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2); + const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4); + const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i AXIJKLMN = + _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14); + const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); + const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO); + const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left); + const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left); + const __m128i row0 = + _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12); + const __m128i row1 = + _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12); + const __m128i row2 = + _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12); + const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12); + const __m128i row4 = + _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12); + const __m128i row5 = + _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12); + const __m128i row6 = + _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12); + const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12); + (void)bd; + _mm_store_si128((__m128i *)dst, row0); + dst += stride; + _mm_store_si128((__m128i *)dst, row1); + dst += stride; + _mm_store_si128((__m128i *)dst, row2); + dst += stride; + _mm_store_si128((__m128i *)dst, row3); + dst += stride; + _mm_store_si128((__m128i *)dst, row4); + dst += stride; + _mm_store_si128((__m128i *)dst, row5); + dst += stride; + _mm_store_si128((__m128i *)dst, row6); + dst += stride; + _mm_store_si128((__m128i *)dst, row7); +} + +void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_srli_si128(A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_srli_si128(A1, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i AXL0 = 
_mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); + const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); + const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); + const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); + const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); + __m128i row_0 = avg3_0; + __m128i row_1 = avg3_1; + __m128i avg2_avg3_left[2][2]; + int i, j; + (void)bd; + + avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); + + for (j = 0; j < 2; ++j) { + for (i = 0; i < 2; ++i) { + const __m128i avg2_avg3 = avg2_avg3_left[j][i]; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + } + } +} + +void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_srli_si128(A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_srli_si128(A3, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); + const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); + const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12); + const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12); + const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); + const __m128i 
avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); + const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2); + const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3); + const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); + const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); + const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2); + const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3); + __m128i row_0 = avg3_0; + __m128i row_1 = avg3_1; + __m128i row_2 = avg3_2; + __m128i row_3 = avg3_3; + __m128i avg2_avg3_left[4][2]; + int i, j; + (void)bd; + + avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2); + avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2); + avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3); + avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3); + + for (j = 0; j < 4; ++j) { + for (i = 0; i < 2; ++i) { + const __m128i avg2_avg3 = avg2_avg3_left[j][i]; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + } + } +} + +static INLINE void d207_store_4x8(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b) { + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)left); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i 
HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH); + const __m128i out_a = _mm_unpacklo_epi16(avg2, avg3); + const __m128i out_b = _mm_unpackhi_epi16(avg2, avg3); + (void)above; + (void)bd; + d207_store_4x8(&dst, stride, &out_a, &out_b); + d207_store_4x8(&dst, stride, &out_b, &HHHHHHHH); +} + +static INLINE void d207_store_4x16(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b, + const __m128i *c) { + _mm_store_si128((__m128i *)*dst, *a); + _mm_store_si128((__m128i *)(*dst + 8), *b); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)left); + const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i LR = _mm_unpackhi_epi64(LR0, LR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(LR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(LR, A1, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0); + const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0); + const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1); + const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1); + (void)above; + (void)bd; + d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c); + d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d); + d207_store_4x16(&dst, stride, &out_c, &out_d, &LR); + d207_store_4x16(&dst, stride, &out_d, &LR, &LR); +} + +static INLINE void d207_store_4x32(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b, + const __m128i *c, const __m128i *d, + const __m128i *e) { + _mm_store_si128((__m128i *)*dst, *a); + _mm_store_si128((__m128i *)(*dst + 8), *b); + _mm_store_si128((__m128i *)(*dst + 16), *c); + _mm_store_si128((__m128i *)(*dst + 24), *d); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 4)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 8)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + 
_mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 12)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)left); + const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i LR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i LR = _mm_unpackhi_epi64(LR0, LR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(LR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(LR, A3, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i avg2_2 = _mm_avg_epu16(A2, B2); + const __m128i avg2_3 = _mm_avg_epu16(A3, B3); + const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0); + const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0); + const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1); + const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1); + const __m128i out_e = _mm_unpacklo_epi16(avg2_2, avg3_2); + const __m128i out_f = _mm_unpackhi_epi16(avg2_2, avg3_2); + const __m128i out_g = _mm_unpacklo_epi16(avg2_3, avg3_3); + const __m128i out_h = _mm_unpackhi_epi16(avg2_3, avg3_3); + (void)above; + (void)bd; + d207_store_4x32(&dst, stride, &out_a, &out_b, &out_c, &out_d, &out_e); + d207_store_4x32(&dst, stride, &out_b, &out_c, &out_d, &out_e, &out_f); + d207_store_4x32(&dst, stride, &out_c, &out_d, &out_e, &out_f, &out_g); + d207_store_4x32(&dst, stride, &out_d, &out_e, &out_f, &out_g, &out_h); + d207_store_4x32(&dst, stride, &out_e, &out_f, &out_g, &out_h, &LR); + d207_store_4x32(&dst, stride, &out_f, &out_g, &out_h, &LR, &LR); + d207_store_4x32(&dst, stride, &out_g, &out_h, &LR, &LR, &LR); + d207_store_4x32(&dst, stride, &out_h, &LR, &LR, &LR, &LR); +} + +static INLINE void d63_store_4x8(uint16_t **dst, const ptrdiff_t stride, + __m128i *a, __m128i *b, const __m128i *ar) { + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, *b); + *dst += stride; + *a = _mm_alignr_epi8(*ar, *a, 2); + *b = _mm_alignr_epi8(*ar, *b, 2); + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, *b); + *dst += stride; + *a = _mm_alignr_epi8(*ar, *a, 2); + *b = _mm_alignr_epi8(*ar, *b, 2); +} + +void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, 
ABCDEFGH, 4); + __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH); + (void)left; + (void)bd; + d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH); + d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH); +} + +void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(AR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(AR, A1, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + __m128i avg2_0 = _mm_avg_epu16(A0, B0); + __m128i avg2_1 = _mm_avg_epu16(A1, B1); + int i; + (void)left; + (void)bd; + for (i = 0; i < 14; i += 2) { + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + dst += stride; + avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2); + avg2_1 = _mm_alignr_epi8(AR, avg2_1, 2); + avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2); + avg3_1 = _mm_alignr_epi8(AR, avg3_1, 2); + } + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); +} + +void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(AR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(AR, A3, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + __m128i avg2_0 = _mm_avg_epu16(A0, B0); + __m128i avg2_1 = _mm_avg_epu16(A1, B1); + __m128i avg2_2 = _mm_avg_epu16(A2, B2); + __m128i avg2_3 = _mm_avg_epu16(A3, B3); + int i; + (void)left; + (void)bd; + for (i = 0; i < 30; i += 2) { + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + _mm_store_si128((__m128i *)(dst + 16), avg2_2); + _mm_store_si128((__m128i *)(dst + 24), avg2_3); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); + dst += stride; + avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2); + avg2_1 = _mm_alignr_epi8(avg2_2, avg2_1, 2); + avg2_2 = _mm_alignr_epi8(avg2_3, avg2_2, 
2); + avg2_3 = _mm_alignr_epi8(AR, avg2_3, 2); + avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2); + avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2); + avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2); + avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2); + } + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + _mm_store_si128((__m128i *)(dst + 16), avg2_2); + _mm_store_si128((__m128i *)(dst + 24), avg2_3); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); +} diff --git a/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h new file mode 100644 index 0000000000..e0f7495521 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -0,0 +1,400 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ +#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ + +#include <emmintrin.h> // SSE2 + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void extend_64bit(const __m128i in, + __m128i *const out /*out[2]*/) { + out[0] = _mm_unpacklo_epi32(in, in); // 0, 0, 1, 1 + out[1] = _mm_unpackhi_epi32(in, in); // 2, 2, 3, 3 +} + +static INLINE __m128i wraplow_16bit_shift4(const __m128i in0, const __m128i in1, + const __m128i rounding) { + __m128i temp[2]; + temp[0] = _mm_add_epi32(in0, rounding); + temp[1] = _mm_add_epi32(in1, rounding); + temp[0] = _mm_srai_epi32(temp[0], 4); + temp[1] = _mm_srai_epi32(temp[1], 4); + return _mm_packs_epi32(temp[0], temp[1]); +} + +static INLINE __m128i wraplow_16bit_shift5(const __m128i in0, const __m128i in1, + const __m128i rounding) { + __m128i temp[2]; + temp[0] = _mm_add_epi32(in0, rounding); + temp[1] = _mm_add_epi32(in1, rounding); + temp[0] = _mm_srai_epi32(temp[0], 5); + temp[1] = _mm_srai_epi32(temp[1], 5); + return _mm_packs_epi32(temp[0], temp[1]); +} + +static INLINE __m128i dct_const_round_shift_64bit(const __m128i in) { + const __m128i t = + _mm_add_epi64(in, pair_set_epi32(DCT_CONST_ROUNDING << 2, 0)); + return _mm_srli_si128(t, 2); +} + +static INLINE __m128i pack_4(const __m128i in0, const __m128i in1) { + const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 2 + const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 1, 3 + return _mm_unpacklo_epi32(t0, t1); // 0, 1, 2, 3 +} + +static INLINE void abs_extend_64bit_sse2(const __m128i in, + __m128i *const out /*out[2]*/, + __m128i *const sign /*sign[2]*/) { + sign[0] = _mm_srai_epi32(in, 31); + out[0] = _mm_xor_si128(in, sign[0]); + out[0] = _mm_sub_epi32(out[0], sign[0]); + sign[1] = _mm_unpackhi_epi32(sign[0], sign[0]); // 64-bit sign of 2, 3 + sign[0] = _mm_unpacklo_epi32(sign[0], sign[0]); // 64-bit sign of 0, 1 + out[1] = _mm_unpackhi_epi32(out[0], out[0]); // 2, 3 + out[0] = _mm_unpacklo_epi32(out[0], out[0]); // 0, 1 +} + +// Note: cospi must be non negative.
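+// SSE2 has no signed 32-bit multiply that produces 64-bit results
+// (_mm_mul_epi32 is SSE4.1), so the helpers below multiply absolute values
+// with the unsigned _mm_mul_epu32 and restore the sign afterwards.  With a
+// 64-bit sign mask s that is either all zeros or all ones, (v ^ s) - s
+// leaves v unchanged when s == 0 and yields -v when s == ~0 (e.g. v = 5,
+// s == ~0: (5 ^ s) - s = -6 + 1 = -5), which is the xor/sub pair used here.
+// That is also why the cosine constants must be passed as non-negative
+// magnitudes: only the sign of the input operand is tracked.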
+static INLINE __m128i multiply_apply_sign_sse2(const __m128i in, + const __m128i sign, + const __m128i cospi) { + __m128i out = _mm_mul_epu32(in, cospi); + out = _mm_xor_si128(out, sign); + return _mm_sub_epi64(out, sign); +} + +// Note: c must be non negative. +static INLINE __m128i multiplication_round_shift_sse2( + const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/, + const int c) { + const __m128i pair_c = pair_set_epi32(c << 2, 0); + __m128i t0, t1; + + assert(c >= 0); + t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c); + t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +// Note: c must be non negative. +static INLINE __m128i multiplication_neg_round_shift_sse2( + const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/, + const int c) { + const __m128i pair_c = pair_set_epi32(c << 2, 0); + __m128i t0, t1; + + assert(c >= 0); + t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c); + t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c); + t0 = _mm_sub_epi64(_mm_setzero_si128(), t0); + t1 = _mm_sub_epi64(_mm_setzero_si128(), t1); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +// Note: c0 and c1 must be non negative. +static INLINE void highbd_butterfly_sse2(const __m128i in0, const __m128i in1, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0); + const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0); + __m128i temp1[4], temp2[4], sign1[2], sign2[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in0, temp1, sign1); + abs_extend_64bit_sse2(in1, temp2, sign2); + temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c1); + temp1[3] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c1); + temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c0); + temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c0); + temp2[2] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c0); + temp2[3] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c0); + temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c1); + temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c1); + temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); + temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); + temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); + temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); + temp1[0] = dct_const_round_shift_64bit(temp1[0]); + temp1[1] = dct_const_round_shift_64bit(temp1[1]); + temp2[0] = dct_const_round_shift_64bit(temp2[0]); + temp2[1] = dct_const_round_shift_64bit(temp2[1]); + *out0 = pack_4(temp1[0], temp1[1]); + *out1 = pack_4(temp2[0], temp2[1]); +} + +// Note: c0 and c1 must be non negative. +static INLINE void highbd_partial_butterfly_sse2(const __m128i in, const int c0, + const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2], sign[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in, temp, sign); + *out0 = multiplication_round_shift_sse2(temp, sign, c0); + *out1 = multiplication_round_shift_sse2(temp, sign, c1); +} + +// Note: c0 and c1 must be non negative. 
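+// Scalar equivalent of the _neg variant below:
+//   *out0 = dct_const_round_shift(in * -c1);
+//   *out1 = dct_const_round_shift(in * c0);
+// The negation is applied to the 64-bit product before rounding (see
+// multiplication_neg_round_shift_sse2 above), presumably so the result stays
+// bit-exact with the C reference for negative products rather than negating
+// an already rounded value.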
+static INLINE void highbd_partial_butterfly_neg_sse2(const __m128i in, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2], sign[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in, temp, sign); + *out0 = multiplication_neg_round_shift_sse2(temp, sign, c1); + *out1 = multiplication_round_shift_sse2(temp, sign, c0); +} + +static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0, + const __m128i in1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp1[2], temp2, sign[2]; + + temp2 = _mm_add_epi32(in0, in1); + abs_extend_64bit_sse2(temp2, temp1, sign); + *out0 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); + temp2 = _mm_sub_epi32(in0, in1); + abs_extend_64bit_sse2(temp2, temp1, sign); + *out1 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void highbd_add_sub_butterfly(const __m128i *in, __m128i *out, + int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm_add_epi32(in[i], in[bound - i]); + out[bound - i] = _mm_sub_epi32(in[i], in[bound - i]); + i++; + } +} + +static INLINE void highbd_idct8_stage4(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); +} + +static INLINE void highbd_idct8x8_final_round(__m128i *const io) { + io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16)); + io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16)); + io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16)); + io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16)); + io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16)); + io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16)); + io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16)); + io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16)); +} + +static INLINE void highbd_idct16_4col_stage7(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[15]); + out[1] = _mm_add_epi32(in[1], in[14]); + out[2] = _mm_add_epi32(in[2], in[13]); + out[3] = _mm_add_epi32(in[3], in[12]); + out[4] = _mm_add_epi32(in[4], in[11]); + out[5] = _mm_add_epi32(in[5], in[10]); + out[6] = _mm_add_epi32(in[6], in[9]); + out[7] = _mm_add_epi32(in[7], in[8]); + out[8] = _mm_sub_epi32(in[7], in[8]); + out[9] = _mm_sub_epi32(in[6], in[9]); + out[10] = _mm_sub_epi32(in[5], in[10]); + out[11] = _mm_sub_epi32(in[4], in[11]); + out[12] = _mm_sub_epi32(in[3], in[12]); + out[13] = _mm_sub_epi32(in[2], in[13]); + out[14] = _mm_sub_epi32(in[1], in[14]); + out[15] = _mm_sub_epi32(in[0], in[15]); +} + +static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1, + const int bd) { + const __m128i zero = _mm_set1_epi16(0); + // Faster than _mm_set1_epi16((1 << bd) - 1). 
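+  // bd is a run-time value here, so (1 << bd) - 1 cannot become a
+  // compile-time vector constant; computing max with a vector shift and
+  // subtract below likely avoids the scalar-compute-then-broadcast sequence
+  // that a non-constant _mm_set1_epi16() would expand to.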
+ const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + __m128i d; + + d = _mm_adds_epi16(in0, in1); + d = _mm_max_epi16(d, zero); + d = _mm_min_epi16(d, max); + + return d; +} + +static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input, + uint16_t *dest, int stride, int bd, + const int size) { + int a1, i, j; + tran_low_t out; + __m128i dc, d; + + out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, (size == 8) ? 5 : 6); + dc = _mm_set1_epi16(a1); + + for (i = 0; i < size; ++i) { + for (j = 0; j < size; j += 8) { + d = _mm_load_si128((const __m128i *)(&dest[j])); + d = add_clamp(d, dc, bd); + _mm_store_si128((__m128i *)(&dest[j]), d); + } + dest += stride; + } +} + +static INLINE void recon_and_store_4(const __m128i in, uint16_t *const dest, + const int bd) { + __m128i d; + + d = _mm_loadl_epi64((const __m128i *)dest); + d = add_clamp(d, in, bd); + _mm_storel_epi64((__m128i *)dest, d); +} + +static INLINE void recon_and_store_4x2(const __m128i in, uint16_t *const dest, + const int stride, const int bd) { + __m128i d; + + d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride)); + d = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(d), (const __m64 *)(dest + 1 * stride))); + d = add_clamp(d, in, bd); + _mm_storel_epi64((__m128i *)(dest + 0 * stride), d); + _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d)); +} + +static INLINE void recon_and_store_4x4(const __m128i *const in, uint16_t *dest, + const int stride, const int bd) { + recon_and_store_4x2(in[0], dest, stride, bd); + dest += 2 * stride; + recon_and_store_4x2(in[1], dest, stride, bd); +} + +static INLINE void recon_and_store_8(const __m128i in, uint16_t **const dest, + const int stride, const int bd) { + __m128i d; + + d = _mm_load_si128((const __m128i *)(*dest)); + d = add_clamp(d, in, bd); + _mm_store_si128((__m128i *)(*dest), d); + *dest += stride; +} + +static INLINE void recon_and_store_8x8(const __m128i *const in, uint16_t *dest, + const int stride, const int bd) { + recon_and_store_8(in[0], &dest, stride, bd); + recon_and_store_8(in[1], &dest, stride, bd); + recon_and_store_8(in[2], &dest, stride, bd); + recon_and_store_8(in[3], &dest, stride, bd); + recon_and_store_8(in[4], &dest, stride, bd); + recon_and_store_8(in[5], &dest, stride, bd); + recon_and_store_8(in[6], &dest, stride, bd); + recon_and_store_8(in[7], &dest, stride, bd); +} + +static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) { + const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0)); + const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4)); + return _mm_packs_epi32(t0, t1); +} + +static INLINE void highbd_load_pack_transpose_32bit_8x8(const tran_low_t *input, + const int stride, + __m128i *const in) { + in[0] = load_pack_8_32bit(input + 0 * stride); + in[1] = load_pack_8_32bit(input + 1 * stride); + in[2] = load_pack_8_32bit(input + 2 * stride); + in[3] = load_pack_8_32bit(input + 3 * stride); + in[4] = load_pack_8_32bit(input + 4 * stride); + in[5] = load_pack_8_32bit(input + 5 * stride); + in[6] = load_pack_8_32bit(input + 6 * stride); + in[7] = load_pack_8_32bit(input + 7 * stride); + transpose_16bit_8x8(in, in); +} + +static INLINE void highbd_load_transpose_32bit_8x4(const tran_low_t *input, + const int stride, + __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride 
+ 0)); + in[1] = _mm_load_si128((const __m128i *)(input + 0 * stride + 4)); + in[2] = _mm_load_si128((const __m128i *)(input + 1 * stride + 0)); + in[3] = _mm_load_si128((const __m128i *)(input + 1 * stride + 4)); + in[4] = _mm_load_si128((const __m128i *)(input + 2 * stride + 0)); + in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride + 4)); + in[6] = _mm_load_si128((const __m128i *)(input + 3 * stride + 0)); + in[7] = _mm_load_si128((const __m128i *)(input + 3 * stride + 4)); + transpose_32bit_8x4(in, in); +} + +static INLINE void highbd_load_transpose_32bit_4x4(const tran_low_t *input, + const int stride, + __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + transpose_32bit_4x4(in, in); +} + +static INLINE void highbd_write_buffer_8(uint16_t *dest, const __m128i in, + const int bd) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + __m128i out; + + out = _mm_adds_epi16(in, final_rounding); + out = _mm_srai_epi16(out, 6); + recon_and_store_8(out, &dest, 0, bd); +} + +static INLINE void highbd_write_buffer_4(uint16_t *const dest, const __m128i in, + const int bd) { + const __m128i final_rounding = _mm_set1_epi32(1 << 5); + __m128i out; + + out = _mm_add_epi32(in, final_rounding); + out = _mm_srai_epi32(out, 6); + out = _mm_packs_epi32(out, out); + recon_and_store_4(out, dest, bd); +} + +#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h new file mode 100644 index 0000000000..9c8eef40f7 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ +#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ + +#include <smmintrin.h> // SSE4.1 + +#include "./vpx_config.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" + +static INLINE __m128i multiplication_round_shift_sse4_1( + const __m128i *const in /*in[2]*/, const int c) { + const __m128i pair_c = pair_set_epi32(c * 4, 0); + __m128i t0, t1; + + t0 = _mm_mul_epi32(in[0], pair_c); + t1 = _mm_mul_epi32(in[1], pair_c); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +static INLINE void highbd_butterfly_sse4_1(const __m128i in0, const __m128i in1, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i temp1[4], temp2[4]; + + extend_64bit(in0, temp1); + extend_64bit(in1, temp2); + temp1[2] = _mm_mul_epi32(temp1[0], pair_c1); + temp1[3] = _mm_mul_epi32(temp1[1], pair_c1); + temp1[0] = _mm_mul_epi32(temp1[0], pair_c0); + temp1[1] = _mm_mul_epi32(temp1[1], pair_c0); + temp2[2] = _mm_mul_epi32(temp2[0], pair_c0); + temp2[3] = _mm_mul_epi32(temp2[1], pair_c0); + temp2[0] = _mm_mul_epi32(temp2[0], pair_c1); + temp2[1] = _mm_mul_epi32(temp2[1], pair_c1); + temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); + temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); + temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); + temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); + temp1[0] = dct_const_round_shift_64bit(temp1[0]); + temp1[1] = dct_const_round_shift_64bit(temp1[1]); + temp2[0] = dct_const_round_shift_64bit(temp2[0]); + temp2[1] = dct_const_round_shift_64bit(temp2[1]); + *out0 = pack_4(temp1[0], temp1[1]); + *out1 = pack_4(temp2[0], temp2[1]); +} + +static INLINE void highbd_butterfly_cospi16_sse4_1(const __m128i in0, + const __m128i in1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp1[2], temp2; + + temp2 = _mm_add_epi32(in0, in1); + extend_64bit(temp2, temp1); + *out0 = multiplication_round_shift_sse4_1(temp1, cospi_16_64); + temp2 = _mm_sub_epi32(in0, in1); + extend_64bit(temp2, temp1); + *out1 = multiplication_round_shift_sse4_1(temp1, cospi_16_64); +} + +static INLINE void highbd_partial_butterfly_sse4_1(const __m128i in, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2]; + + extend_64bit(in, temp); + *out0 = multiplication_round_shift_sse4_1(temp, c0); + *out1 = multiplication_round_shift_sse4_1(temp, c1); +} + +#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ diff --git a/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c index 7d66411080..ec22db9f4c 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c @@ -12,7 +12,6 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" -#include "vpx_ports/emmintrin_compat.h" static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { __m128i ubounded; @@ -48,10 +47,10 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { // TODO(debargha, peter): Break up large functions into smaller ones // in this file.
-void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { +void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi16(1); __m128i blimit, limit, thresh; @@ -475,12 +474,12 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, _mm_store_si128((__m128i *)(s - 0 * p), q0); } -void vpx_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p, +void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh, int bd) { - vpx_highbd_lpf_horizontal_edge_8_sse2(s, p, _blimit, _limit, _thresh, bd); - vpx_highbd_lpf_horizontal_edge_8_sse2(s + 8, p, _blimit, _limit, _thresh, bd); + vpx_highbd_lpf_horizontal_16_sse2(s, p, _blimit, _limit, _thresh, bd); + vpx_highbd_lpf_horizontal_16_sse2(s + 8, p, _blimit, _limit, _thresh, bd); } void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, @@ -1108,8 +1107,8 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, highbd_transpose(src, p, dst, 8, 2); // Loop filtering - vpx_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh, - bd); + vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh, + bd); src[0] = t_dst; src[1] = t_dst + 8 * 8; dst[0] = s - 8; @@ -1130,7 +1129,7 @@ void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p, highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); // Loop filtering - vpx_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, + vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh, bd); // Transpose back diff --git a/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 64b56223ed..cedf98aff4 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include #include #include "vpx_dsp/vpx_dsp_common.h" @@ -37,54 +38,54 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); (void)scan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = ((int)count / 4) - 1; i >= 0; i--) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (test == 0xffff) - non_zero_regs--; - else - break; - } + // Pre-scan pass + for (i = ((int)count / 4) - 1; i >= 0; i--) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (test == 0xffff) + non_zero_regs--; + else + break; + } - // Quantization pass: - for (i = 0; i < non_zero_regs; i++) { - __m128i coeffs, coeffs_sign, tmp1, tmp2; - int test; - int abs_coeff[4]; - int coeff_sign[4]; + // Quantization pass: + for (i = 0; i < non_zero_regs; i++) { + __m128i coeffs, coeffs_sign, tmp1, tmp2; + int test; + int abs_coeff[4]; + int coeff_sign[4]; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - coeffs_sign = _mm_srai_epi32(coeffs, 31); - coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); - tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); - tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); - tmp1 = _mm_or_si128(tmp1, tmp2); - test = _mm_movemask_epi8(tmp1); - _mm_storeu_si128((__m128i *)abs_coeff, coeffs); - _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + coeffs_sign = _mm_srai_epi32(coeffs, 31); + coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); + tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); + tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); + tmp1 = _mm_or_si128(tmp1, tmp2); + test = _mm_movemask_epi8(tmp1); + _mm_storeu_si128((__m128i *)abs_coeff, coeffs); + _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); - for (j = 0; j < 4; j++) { - if (test & (1 << (4 * j))) { - int k = 4 * i + j; - const int64_t tmp1 = abs_coeff[j] + round_ptr[k != 0]; - const int64_t tmp2 = ((tmp1 * quant_ptr[k != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16); - qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; - dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; - if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; - } + for (j = 0; j < 4; j++) { + if (test & (1 << (4 * j))) { + int k = 4 * i + j; + const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; + const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; + const uint32_t abs_qcoeff = + (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); + qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; + dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; + if (abs_qcoeff) eob_i = iscan[k] > eob_i ? 
iscan[k] : eob_i; } } } @@ -105,6 +106,9 @@ void vpx_highbd_quantize_b_32x32_sse2( const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); (void)scan; + (void)skip_block; + assert(!skip_block); + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); @@ -116,38 +120,35 @@ void vpx_highbd_quantize_b_32x32_sse2( memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs / 4; i++) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (!(test & 0xf)) idx_arr[idx++] = i * 4; - if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; - if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; - if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; - } + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = idx_arr[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; - } + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? 
iscan[idx_arr[i]] : eob; } *eob_ptr = eob + 1; } diff --git a/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm index 30ee81b688..d9a6932e0b 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -72,7 +72,7 @@ SECTION .text paddd m6, m4 mov r1, ssem ; r1 = unsigned int *sse movd [r1], m7 ; store sse - movd rax, m6 ; store sum as return value + movd eax, m6 ; store sum as return value %endif RET %endmacro diff --git a/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm b/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm index 923418a992..e646767e19 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;unsigned int vpx_highbd_calc16x16var_sse2 ;( ; unsigned char * src_ptr, diff --git a/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c index 414ae5de1a..a6f7c3d25d 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c @@ -135,7 +135,7 @@ HIGH_GET_VAR(8); highbd_8_variance_sse2( \ src, src_stride, ref, ref_stride, w, h, sse, &sum, \ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - return *sse - (((int64_t)sum * sum) >> shift); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ } \ \ uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \ @@ -293,12 +293,13 @@ DECLS(sse2); } \ } \ *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ } \ \ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + int64_t var; \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ @@ -328,7 +329,8 @@ DECLS(sse2); se = ROUND_POWER_OF_TWO(se, 2); \ sse = ROUND_POWER_OF_TWO(sse, 4); \ *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } \ \ uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ @@ -337,6 +339,7 @@ DECLS(sse2); int start_row; \ uint32_t sse; \ int se = 0; \ + int64_t var; \ uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ @@ -375,7 +378,8 @@ DECLS(sse2); se = ROUND_POWER_OF_TWO(se, 4); \ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ } #define FNS(opt) \ @@ -444,13 +448,14 @@ DECLS(sse2); } \ } \ *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ } \ \ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ + int64_t var; \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ @@ -481,7 +486,8 @@ DECLS(sse2); se = ROUND_POWER_OF_TWO(se, 2); \ sse = ROUND_POWER_OF_TWO(sse, 4); \ *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } \ \ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ @@ -489,6 +495,7 @@ DECLS(sse2); const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ int start_row; \ + int64_t var; \ uint32_t sse; \ int se = 0; \ uint64_t long_sse = 0; \ @@ -530,7 +537,8 @@ DECLS(sse2); se = ROUND_POWER_OF_TWO(se, 4); \ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } #define FNS(opt1) \ diff --git a/libs/libvpx/vpx_dsp/x86/intrapred_sse2.asm b/libs/libvpx/vpx_dsp/x86/intrapred_sse2.asm index c18095c287..61af6236ed 100644 --- a/libs/libvpx/vpx_dsp/x86/intrapred_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/intrapred_sse2.asm @@ -61,7 +61,7 @@ cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset psrlq m3, 8 movd [dstq+strideq ], m3 psrlq m0, 56 - movd tempq, m0 + movd tempd, m0 mov [dstq+strideq+3], tempb RESTORE_GOT diff --git a/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c index cb56ad0789..f6e56b6f9e 100644 --- a/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c @@ -8,169 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. 
 */ +#include <emmintrin.h> // SSE2 + #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -#define RECON_AND_STORE4X4(dest, in_x) \ - { \ - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - *(int *)(dest) = _mm_cvtsi128_si32(d0); \ - } - -void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i eight = _mm_set1_epi16(8); - const __m128i cst = _mm_setr_epi16( - (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, - (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, - (int16_t)cospi_8_64, (int16_t)cospi_24_64); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i input0, input1, input2, input3; - - // Rows - input0 = load_input_data(input); - input2 = load_input_data(input + 8); - - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_shufflelo_epi16(input0, 0xd8); - input0 = _mm_shufflehi_epi16(input0, 0xd8); - input2 = _mm_shufflelo_epi16(input2, 0xd8); - input2 = _mm_shufflehi_epi16(input2, 0xd8); - - input1 = _mm_unpackhi_epi32(input0, input0); - input0 = _mm_unpacklo_epi32(input0, input0); - input3 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpacklo_epi32(input2, input2); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input1); - input1 = _mm_packs_epi32(input2, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3.
- input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Columns - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_unpacklo_epi32(input2, input2); - input1 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpackhi_epi32(input3, input3); - input3 = _mm_unpacklo_epi32(input3, input3); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input2); - input1 = _mm_packs_epi32(input1, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. - input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Final round and shift - input2 = _mm_add_epi16(input2, eight); - input3 = _mm_add_epi16(input3, eight); - - input2 = _mm_srai_epi16(input2, 4); - input3 = _mm_srai_epi16(input3, 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi32(d0, - _mm_cvtsi32_si128(*(const int *)(dest + stride))); - d2 = _mm_unpacklo_epi32( - _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d0 = _mm_add_epi16(d0, input2); - d2 = _mm_add_epi16(d2, input3); - d0 = _mm_packus_epi16(d0, d2); - // store input0 - *(int *)dest = _mm_cvtsi128_si32(d0); - // store input1 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - // store input2 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - // store input3 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); - } -} - -void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 4); - - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE4X4(dest + 0 * stride, dc_value); - RECON_AND_STORE4X4(dest + 1 * stride, dc_value); - RECON_AND_STORE4X4(dest + 2 * stride, dc_value); - RECON_AND_STORE4X4(dest + 3 * stride, dc_value); -} - -static INLINE void transpose_4x4(__m128i *res) { +static INLINE void transpose_16bit_4(__m128i *res) { const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); @@ -178,35 +23,75 @@ static INLINE void transpose_4x4(__m128i *res) { res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); } -void idct4_sse2(__m128i *in) { +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const 
__m128i eight = _mm_set1_epi16(8); + __m128i in[2]; + + // Rows + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8); + idct4_sse2(in); + + // Columns + idct4_sse2(in); + + // Final round and shift + in[0] = _mm_add_epi16(in[0], eight); + in[1] = _mm_add_epi16(in[1], eight); + in[0] = _mm_srai_epi16(in[0], 4); + in[1] = _mm_srai_epi16(in[1], 4); + + recon_and_store4x4_sse2(in, dest, stride); +} + +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + int a; + __m128i dc_value, d[2]; + + a = (int)dct_const_round_shift((int16_t)input[0] * cospi_16_64); + a = (int)dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 4); + + dc_value = _mm_set1_epi16(a); + + // Reconstruction and Store + d[0] = _mm_cvtsi32_si128(*(const int *)(dest)); + d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); + d[0] = _mm_unpacklo_epi32(d[0], + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d[1] = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]); + d[0] = _mm_unpacklo_epi8(d[0], zero); + d[1] = _mm_unpacklo_epi8(d[1], zero); + d[0] = _mm_add_epi16(d[0], dc_value); + d[1] = _mm_add_epi16(d[1], dc_value); + d[0] = _mm_packus_epi16(d[0], d[1]); + + *(int *)dest = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]); +} + +void idct4_sse2(__m128i *const in) { const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8]; + __m128i u[2]; - transpose_4x4(in); + transpose_16bit_4(in); // stage 1 u[0] = _mm_unpacklo_epi16(in[0], in[1]); u[1] = _mm_unpackhi_epi16(in[0], in[1]); - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[3], v[2]); + u[0] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_p16_m16, u[0]); + u[1] = idct_calc_wraplow_sse2(k__cospi_p08_p24, k__cospi_p24_m08, u[1]); // stage 2 in[0] = _mm_add_epi16(u[0], u[1]); @@ -214,7 +99,7 @@ void idct4_sse2(__m128i *in) { in[1] = _mm_shuffle_epi32(in[1], 0x4E); } -void iadst4_sse2(__m128i *in) { +void iadst4_sse2(__m128i *const in) { const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); @@ -224,7 +109,7 @@ void iadst4_sse2(__m128i *in) { const __m128i k__DCT_CONST_ROUNDING = 
_mm_set1_epi32(DCT_CONST_ROUNDING); __m128i u[8], v[8], in7; - transpose_4x4(in); + transpose_16bit_4(in); in7 = _mm_srli_si128(in[1], 8); in7 = _mm_add_epi16(in7, in[0]); in7 = _mm_sub_epi16(in7, in[1]); @@ -263,309 +148,93 @@ void iadst4_sse2(__m128i *in) { in[1] = _mm_packs_epi32(u[2], u[3]); } -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ - const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ - const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ - const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ - const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ - out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ - out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ - out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ - } - -#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \ - { \ - const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ - const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - } - -#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - } - -// Define Macro for multiplying elements by constants and adding them together. 
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \ - res0, res1, res2, res3) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - tmp4 = _mm_madd_epi16(lo_1, cst2); \ - tmp5 = _mm_madd_epi16(hi_1, cst2); \ - tmp6 = _mm_madd_epi16(lo_1, cst3); \ - tmp7 = _mm_madd_epi16(hi_1, cst3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - res2 = _mm_packs_epi32(tmp4, tmp5); \ - res3 = _mm_packs_epi32(tmp6, tmp7); \ - } - -#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - } - -#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \ - out4, out5, out6, out7) \ - { \ - /* Stage1 */ \ - { \ - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ - \ - MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \ - stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ - const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ - \ - MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \ - stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ - tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ - tmp2 = _mm_madd_epi16(lo_56, 
stg2_0); \ - tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - } \ - \ - /* Stage4 */ \ - out0 = _mm_adds_epi16(stp1_0, stp2_7); \ - out1 = _mm_adds_epi16(stp1_1, stp1_6); \ - out2 = _mm_adds_epi16(stp1_2, stp1_5); \ - out3 = _mm_adds_epi16(stp1_3, stp2_4); \ - out4 = _mm_subs_epi16(stp1_3, stp2_4); \ - out5 = _mm_subs_epi16(stp1_2, stp1_5); \ - out6 = _mm_subs_epi16(stp1_1, stp1_6); \ - out7 = _mm_subs_epi16(stp1_0, stp2_7); \ - } +static INLINE void load_buffer_8x8(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 8); + in[1] = load_input_data8(input + 1 * 8); + in[2] = load_input_data8(input + 2 * 8); + in[3] = load_input_data8(input + 3 * 8); + in[4] = load_input_data8(input + 4 * 8); + in[5] = load_input_data8(input + 5 * 8); + in[6] = load_input_data8(input + 6 * 8); + in[7] = load_input_data8(input + 7 * 8); +} void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i in[8]; int i; // Load input data. 
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - in4 = load_input_data(input + 8 * 4); - in5 = load_input_data(input + 8 * 5); - in6 = load_input_data(input + 8 * 6); - in7 = load_input_data(input + 8 * 7); + load_buffer_8x8(input, in); // 2-D for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, - in6, in7); + idct8_sse2(in); } - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); + write_buffer_8x8(in, dest, stride); +} - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i io[8]; - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); + io[0] = load_input_data4(input + 0 * 8); + io[1] = load_input_data4(input + 1 * 8); + io[2] = load_input_data4(input + 2 * 8); + io[3] = load_input_data4(input + 3 * 8); + + idct8x8_12_add_kernel_sse2(io); + write_buffer_8x8(io, dest, stride); +} + +static INLINE void recon_and_store_8_dual(uint8_t *const dest, + const __m128i in_x, + const int stride) { + const __m128i zero = _mm_setzero_si128(); + __m128i d0, d1; + + d0 = _mm_loadl_epi64((__m128i *)(dest + 0 * stride)); + d1 = _mm_loadl_epi64((__m128i *)(dest + 1 * stride)); + d0 = _mm_unpacklo_epi8(d0, zero); + d1 = _mm_unpacklo_epi8(d1, zero); + d0 = _mm_add_epi16(in_x, d0); + d1 = _mm_add_epi16(in_x, d1); + d0 = _mm_packus_epi16(d0, d1); + _mm_storel_epi64((__m128i *)(dest + 0 * stride), d0); + _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d0)); } void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 5); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 5); + dc_value = _mm_set1_epi16((int16_t)a1); - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE(dest + 0 * stride, dc_value); - RECON_AND_STORE(dest + 1 * stride, dc_value); - RECON_AND_STORE(dest + 2 * stride, dc_value); - RECON_AND_STORE(dest + 3 * stride, dc_value); - RECON_AND_STORE(dest + 4 * stride, dc_value); - RECON_AND_STORE(dest + 5 * stride, dc_value); - RECON_AND_STORE(dest + 6 * stride, dc_value); - 
RECON_AND_STORE(dest + 7 * stride, dc_value); + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); } -void idct8_sse2(__m128i *in) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - +void idct8_sse2(__m128i *const in) { // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0, - in1, in2, in3, in4, in5, in6, in7); + transpose_16bit_8x8(in, in); // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3], - in[4], in[5], in[6], in[7]); + idct8(in, in); } -void iadst8_sse2(__m128i *in) { +void iadst8_sse2(__m128i *const in) { const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); @@ -578,7 +247,7 @@ void iadst8_sse2(__m128i *in) { const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__const_0 = _mm_set1_epi16(0); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -589,7 +258,7 @@ void iadst8_sse2(__m128i *in) { __m128i in0, in1, in2, in3, in4, in5, in6, in7; // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); // properly aligned for butterfly input in0 = in[7]; @@ -751,37 +420,10 @@ void iadst8_sse2(__m128i *in) { u2 = _mm_unpacklo_epi16(s6, s7); u3 = _mm_unpackhi_epi16(s6, s7); - v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); - v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); - v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); - v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); - - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, 
DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - s2 = _mm_packs_epi32(v0, v1); - s3 = _mm_packs_epi32(v2, v3); - s6 = _mm_packs_epi32(v4, v5); - s7 = _mm_packs_epi32(v6, v7); + s2 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_p16); + s3 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_m16); + s6 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_p16); + s7 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_m16); in[0] = s0; in[1] = _mm_sub_epi16(k__const_0, s4); @@ -793,521 +435,133 @@ void iadst8_sse2(__m128i *in) { in[7] = _mm_sub_epi16(k__const_0, s1); } -void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - // Rows. Load 4-row input data. 
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - - // 8x4 Transpose - TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); - // Stage1 - { - const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); - const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_17, stg1_0); - tmp2 = _mm_madd_epi16(lo_17, stg1_1); - tmp4 = _mm_madd_epi16(lo_35, stg1_2); - tmp6 = _mm_madd_epi16(lo_35, stg1_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - stp1_5 = _mm_packs_epi32(tmp4, tmp6); - } - - // Stage2 - { - const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); - const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_04, stg2_0); - tmp2 = _mm_madd_epi16(lo_04, stg2_1); - tmp4 = _mm_madd_epi16(lo_26, stg2_2); - tmp6 = _mm_madd_epi16(lo_26, stg2_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp2_0 = _mm_packs_epi32(tmp0, tmp2); - stp2_2 = _mm_packs_epi32(tmp6, tmp4); - - tmp0 = _mm_adds_epi16(stp1_4, stp1_5); - tmp1 = _mm_subs_epi16(stp1_4, stp1_5); - - stp2_4 = tmp0; - stp2_5 = _mm_unpacklo_epi64(tmp1, zero); - stp2_6 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage3 - { - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); - - tmp4 = _mm_adds_epi16(stp2_0, stp2_2); - tmp6 = _mm_subs_epi16(stp2_0, stp2_2); - - stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); - stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); - - tmp0 = _mm_madd_epi16(lo_56, stg3_0); - tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - tmp0 = _mm_adds_epi16(stp1_3, stp2_4); - tmp1 = _mm_adds_epi16(stp1_2, stp1_5); - tmp2 = _mm_subs_epi16(stp1_3, stp2_4); - tmp3 = _mm_subs_epi16(stp1_2, stp1_5); - - TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) - - IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4, - in5, in6, in7); - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); 
- RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); +static INLINE void idct16_load8x8(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 16); + in[1] = load_input_data8(input + 1 * 16); + in[2] = load_input_data8(input + 2 * 16); + in[3] = load_input_data8(input + 3 * 16); + in[4] = load_input_data8(input + 4 * 16); + in[5] = load_input_data8(input + 5 * 16); + in[6] = load_input_data8(input + 6 * 16); + in[7] = load_input_data8(input + 7 * 16); } -#define IDCT16 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ - const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ - const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ - const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ - const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \ - stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \ - \ - MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \ - stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ - const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ - const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \ - stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \ - \ - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - \ - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ - const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ - const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \ - stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = 
_mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - } - -#define IDCT16_10 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \ - stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \ - stp1_12_0) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \ - \ - stp1_9 = stp1_8_0; \ - stp1_10 = stp1_11; \ - \ - stp1_13 = stp1_12_0; \ - stp1_14 = stp1_15; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \ - stp2_5 = stp2_4; \ - stp2_6 = stp2_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_2 = stp1_1; \ 
- stp1_3 = stp1_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - } - void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[16], l[16], r[16], 
*curr1; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i l[16], r[16], out[16], *in; int i; - curr1 = l; + in = l; for (i = 0; i < 2; i++) { - // 1-D idct - - // Load input data. - in[0] = load_input_data(input); - in[8] = load_input_data(input + 8 * 1); - in[1] = load_input_data(input + 8 * 2); - in[9] = load_input_data(input + 8 * 3); - in[2] = load_input_data(input + 8 * 4); - in[10] = load_input_data(input + 8 * 5); - in[3] = load_input_data(input + 8 * 6); - in[11] = load_input_data(input + 8 * 7); - in[4] = load_input_data(input + 8 * 8); - in[12] = load_input_data(input + 8 * 9); - in[5] = load_input_data(input + 8 * 10); - in[13] = load_input_data(input + 8 * 11); - in[6] = load_input_data(input + 8 * 12); - in[14] = load_input_data(input + 8 * 13); - in[7] = load_input_data(input + 8 * 14); - in[15] = load_input_data(input + 8 * 15); - - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - - IDCT16 - - // Stage7 - curr1[0] = _mm_add_epi16(stp2_0, stp1_15); - curr1[1] = _mm_add_epi16(stp2_1, stp1_14); - curr1[2] = _mm_add_epi16(stp2_2, stp2_13); - curr1[3] = _mm_add_epi16(stp2_3, stp2_12); - curr1[4] = _mm_add_epi16(stp2_4, stp2_11); - curr1[5] = _mm_add_epi16(stp2_5, stp2_10); - curr1[6] = _mm_add_epi16(stp2_6, stp1_9); - curr1[7] = _mm_add_epi16(stp2_7, stp1_8); - curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); - curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); - curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); - curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); - curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); - curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); - curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); - curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); - - curr1 = r; + idct16_load8x8(input, in); + transpose_16bit_8x8(in, in); + idct16_load8x8(input + 8, in + 8); + transpose_16bit_8x8(in + 8, in + 8); + idct16_8col(in, in); + in = r; input += 128; } - for (i = 0; i < 2; i++) { + + for (i = 0; i < 16; i += 8) { int j; - // 1-D idct - array_transpose_8x8(l + i * 8, in); - array_transpose_8x8(r + i * 8, in + 8); - - IDCT16 - - // 2-D - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + idct16_8col(out, out); for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); + write_buffer_8x1(dest + j * stride, out[j]); } dest += 8; } } +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i in[16], temp[16], out[16]; + int i; + + 
idct16_load8x8(input, in); + transpose_16bit_8x8(in, in); + + for (i = 8; i < 16; i++) { + in[i] = _mm_setzero_si128(); + } + idct16_8col(in, temp); + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(temp + i, in); + idct16_8col(in, out); + + for (j = 0; j < 16; ++j) { + write_buffer_8x1(dest + j * stride, out[j]); + } + + dest += 8; + } +} + +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i in[16], l[16]; + int i; + + // First 1-D inverse DCT + // Load input data. + in[0] = load_input_data4(input + 0 * 16); + in[1] = load_input_data4(input + 1 * 16); + in[2] = load_input_data4(input + 2 * 16); + in[3] = load_input_data4(input + 3 * 16); + + idct16x16_10_pass1(in, l); + + // Second 1-D inverse transform, performed per 8x16 block + for (i = 0; i < 16; i += 8) { + int j; + idct16x16_10_pass2(l + i, in); + + for (j = 0; j < 16; ++j) { + write_buffer_8x1(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +static INLINE void recon_and_store_16(uint8_t *const dest, const __m128i in_x) { + const __m128i zero = _mm_setzero_si128(); + __m128i d0, d1; + + d0 = _mm_load_si128((__m128i *)(dest)); + d1 = _mm_unpackhi_epi8(d0, zero); + d0 = _mm_unpacklo_epi8(d0, zero); + d0 = _mm_add_epi16(in_x, d0); + d1 = _mm_add_epi16(in_x, d1); + d0 = _mm_packus_epi16(d0, d1); + _mm_store_si128((__m128i *)(dest), d0); +} + void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, i; + int i; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + dc_value = _mm_set1_epi16((int16_t)a1); for (i = 0; i < 16; ++i) { - RECON_AND_STORE(dest + 0, dc_value); - RECON_AND_STORE(dest + 8, dc_value); + recon_and_store_16(dest, dc_value); dest += stride; } } -static void iadst16_8col(__m128i *in) { +static void iadst16_8col(__m128i *const in) { // perform 16x16 1-D ADST for 8 columns __m128i s[16], x[16], u[32], v[32]; const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); @@ -1335,8 +589,8 @@ static void iadst16_8col(__m128i *in) { const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -1708,1722 +962,371 @@ static void iadst16_8col(__m128i *in) { u[6] = _mm_unpacklo_epi16(s[14], s[15]); u[7] = _mm_unpackhi_epi16(s[14], s[15]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] 
= _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); - v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); - v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); - v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); - v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); - v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); - v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); - v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + in[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_m16); + in[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16); + in[4] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16); + in[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16); + in[6] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p16_p16); + in[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m16_p16); + in[5] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m16_m16); + in[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p16_m16); in[0] = s[0]; in[1] = _mm_sub_epi16(kZero, s[8]); in[2] = s[12]; in[3] = _mm_sub_epi16(kZero, s[4]); - in[4] = _mm_packs_epi32(v[4], v[5]); - in[5] = _mm_packs_epi32(v[12], v[13]); - in[6] = _mm_packs_epi32(v[8], v[9]); - in[7] = _mm_packs_epi32(v[0], v[1]); - in[8] = _mm_packs_epi32(v[2], v[3]); - in[9] = _mm_packs_epi32(v[10], v[11]); - in[10] = _mm_packs_epi32(v[14], v[15]); - in[11] = _mm_packs_epi32(v[6], v[7]); in[12] = s[5]; in[13] = _mm_sub_epi16(kZero, s[13]); in[14] = s[9]; in[15] = _mm_sub_epi16(kZero, s[1]); } -static void idct16_8col(__m128i *in) { - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, 
cospi_14_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i v[16], u[16], s[16], t[16]; - - // stage 1 - s[0] = in[0]; - s[1] = in[8]; - s[2] = in[4]; - s[3] = in[12]; - s[4] = in[2]; - s[5] = in[10]; - s[6] = in[6]; - s[7] = in[14]; - s[8] = in[1]; - s[9] = in[9]; - s[10] = in[5]; - s[11] = in[13]; - s[12] = in[3]; - s[13] = in[11]; - s[14] = in[7]; - s[15] = in[15]; - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[15]); - u[1] = _mm_unpackhi_epi16(s[8], s[15]); - u[2] = _mm_unpacklo_epi16(s[9], s[14]); - u[3] = _mm_unpackhi_epi16(s[9], s[14]); - u[4] = _mm_unpacklo_epi16(s[10], s[13]); - u[5] = _mm_unpackhi_epi16(s[10], s[13]); - u[6] = _mm_unpacklo_epi16(s[11], s[12]); - u[7] = _mm_unpackhi_epi16(s[11], s[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); - v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); - v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); - v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); - v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); - v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); - v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); - v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); - v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); - v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); - v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); - v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); - v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); - v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); - v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); - v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = 
_mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[8] = _mm_packs_epi32(u[0], u[1]); - s[15] = _mm_packs_epi32(u[2], u[3]); - s[9] = _mm_packs_epi32(u[4], u[5]); - s[14] = _mm_packs_epi32(u[6], u[7]); - s[10] = _mm_packs_epi32(u[8], u[9]); - s[13] = _mm_packs_epi32(u[10], u[11]); - s[11] = _mm_packs_epi32(u[12], u[13]); - s[12] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - t[0] = s[0]; - t[1] = s[1]; - t[2] = s[2]; - t[3] = s[3]; - u[0] = _mm_unpacklo_epi16(s[4], s[7]); - u[1] = _mm_unpackhi_epi16(s[4], s[7]); - u[2] = _mm_unpacklo_epi16(s[5], s[6]); - u[3] = _mm_unpackhi_epi16(s[5], s[6]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[4] = _mm_packs_epi32(u[0], u[1]); - t[7] = _mm_packs_epi32(u[2], u[3]); - t[5] = _mm_packs_epi32(u[4], u[5]); - t[6] = _mm_packs_epi32(u[6], u[7]); - t[8] = _mm_add_epi16(s[8], s[9]); - t[9] = _mm_sub_epi16(s[8], s[9]); - t[10] = _mm_sub_epi16(s[11], s[10]); - t[11] = _mm_add_epi16(s[10], s[11]); - t[12] = _mm_add_epi16(s[12], s[13]); - t[13] = _mm_sub_epi16(s[12], s[13]); - t[14] = _mm_sub_epi16(s[15], s[14]); - t[15] = _mm_add_epi16(s[14], s[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(t[0], t[1]); - u[1] = _mm_unpackhi_epi16(t[0], t[1]); - u[2] = _mm_unpacklo_epi16(t[2], t[3]); - u[3] = _mm_unpackhi_epi16(t[2], t[3]); - u[4] = _mm_unpacklo_epi16(t[9], t[14]); - u[5] = _mm_unpackhi_epi16(t[9], t[14]); - u[6] = _mm_unpacklo_epi16(t[10], t[13]); - u[7] = _mm_unpackhi_epi16(t[10], t[13]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[2] = 
_mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); - v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); - v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_add_epi16(t[4], t[5]); - s[5] = _mm_sub_epi16(t[4], t[5]); - s[6] = _mm_sub_epi16(t[7], t[6]); - s[7] = _mm_add_epi16(t[6], t[7]); - s[8] = t[8]; - s[15] = t[15]; - s[9] = _mm_packs_epi32(u[8], u[9]); - s[14] = _mm_packs_epi32(u[10], u[11]); - s[10] = _mm_packs_epi32(u[12], u[13]); - s[13] = _mm_packs_epi32(u[14], u[15]); - s[11] = t[11]; - s[12] = t[12]; - - // stage 5 - t[0] = _mm_add_epi16(s[0], s[3]); - t[1] = _mm_add_epi16(s[1], s[2]); - t[2] = _mm_sub_epi16(s[1], s[2]); - t[3] = _mm_sub_epi16(s[0], s[3]); - t[4] = s[4]; - t[7] = s[7]; - - u[0] = _mm_unpacklo_epi16(s[5], s[6]); - u[1] = _mm_unpackhi_epi16(s[5], s[6]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[0] = _mm_srai_epi32(u[0], 
DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - t[5] = _mm_packs_epi32(u[0], u[1]); - t[6] = _mm_packs_epi32(u[2], u[3]); - - t[8] = _mm_add_epi16(s[8], s[11]); - t[9] = _mm_add_epi16(s[9], s[10]); - t[10] = _mm_sub_epi16(s[9], s[10]); - t[11] = _mm_sub_epi16(s[8], s[11]); - t[12] = _mm_sub_epi16(s[15], s[12]); - t[13] = _mm_sub_epi16(s[14], s[13]); - t[14] = _mm_add_epi16(s[13], s[14]); - t[15] = _mm_add_epi16(s[12], s[15]); - - // stage 6 - s[0] = _mm_add_epi16(t[0], t[7]); - s[1] = _mm_add_epi16(t[1], t[6]); - s[2] = _mm_add_epi16(t[2], t[5]); - s[3] = _mm_add_epi16(t[3], t[4]); - s[4] = _mm_sub_epi16(t[3], t[4]); - s[5] = _mm_sub_epi16(t[2], t[5]); - s[6] = _mm_sub_epi16(t[1], t[6]); - s[7] = _mm_sub_epi16(t[0], t[7]); - s[8] = t[8]; - s[9] = t[9]; - - u[0] = _mm_unpacklo_epi16(t[10], t[13]); - u[1] = _mm_unpackhi_epi16(t[10], t[13]); - u[2] = _mm_unpacklo_epi16(t[11], t[12]); - u[3] = _mm_unpackhi_epi16(t[11], t[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - s[10] = _mm_packs_epi32(u[0], u[1]); - s[13] = _mm_packs_epi32(u[2], u[3]); - s[11] = _mm_packs_epi32(u[4], u[5]); - s[12] = _mm_packs_epi32(u[6], u[7]); - s[14] = t[14]; - s[15] = t[15]; - - // stage 7 - in[0] = _mm_add_epi16(s[0], s[15]); - in[1] = _mm_add_epi16(s[1], s[14]); - in[2] = _mm_add_epi16(s[2], s[13]); - in[3] = _mm_add_epi16(s[3], s[12]); - in[4] = _mm_add_epi16(s[4], s[11]); - in[5] = _mm_add_epi16(s[5], s[10]); - in[6] = _mm_add_epi16(s[6], s[9]); - in[7] = _mm_add_epi16(s[7], s[8]); - in[8] = _mm_sub_epi16(s[7], s[8]); - in[9] = _mm_sub_epi16(s[6], s[9]); - in[10] = _mm_sub_epi16(s[5], s[10]); - in[11] = _mm_sub_epi16(s[4], s[11]); - in[12] = _mm_sub_epi16(s[3], s[12]); - in[13] = _mm_sub_epi16(s[2], s[13]); - in[14] = _mm_sub_epi16(s[1], s[14]); - in[15] = _mm_sub_epi16(s[0], s[15]); +void idct16_sse2(__m128i *const in0, __m128i *const in1) { + transpose_16bit_16x16(in0, in1); + idct16_8col(in0, in0); + idct16_8col(in1, in1); } -void idct16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - idct16_8col(in0); - idct16_8col(in1); -} - -void iadst16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); +void iadst16_sse2(__m128i *const in0, __m128i *const in1) { + transpose_16bit_16x16(in0, in1); iadst16_8col(in0); 
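/* Illustrative aside (assumed semantics, hypothetical scalar names; not part of
 * the patch): every rotation in these 1-D transforms, both in the explicit
 * _mm_madd_epi16 / DCT_CONST_ROUNDING / _mm_srai_epi32 / _mm_packs_epi32
 * sequences being removed and in the butterfly() / idct_calc_wraplow_sse2()
 * helpers used by the new code, appears to follow the same fixed-point pattern:
 * multiply a pair of inputs by a pair of cosine constants, round, and shift
 * right by DCT_CONST_BITS back to 16 bits.  A minimal scalar sketch, consistent
 * with the constant pairs used in this file:
 *
 *   static int16_t round_shift(int32_t x) {
 *     return (int16_t)((x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
 *   }
 *
 *   // out0 = a*c0 - b*c1,  out1 = a*c1 + b*c0, each rounded to 16 bits.
 *   static void butterfly_scalar(int16_t a, int16_t b, int c0, int c1,
 *                                int16_t *out0, int16_t *out1) {
 *     *out0 = round_shift(a * c0 - b * c1);
 *     *out1 = round_shift(a * c1 + b * c0);
 *   }
 *
 * The SSE2 code gets the same result for eight columns at once by interleaving
 * a and b with _mm_unpacklo/hi_epi16 and multiplying with constants built by
 * pair_set_epi16(c0, -c1) and pair_set_epi16(c1, c0).
 */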
iadst16_8col(in1); } -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[8]*/) { const __m128i zero = _mm_setzero_si128(); + __m128i step1[8], step2[8]; - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + // stage 3 + butterfly(in[4], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]); - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + // stage 4 + step2[0] = butterfly_cospi16(in[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[0]; + step1[2] = step2[0]; + step1[3] = step2[0]; + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in[16], l[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8, - stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0, - stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - // First 1-D inverse DCT - // Load input data. 
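/* Note on the *_10_* variant (descriptive only): in this path all non-zero
 * coefficients are assumed to sit in the upper-left 4x4 corner of the 16x16
 * block, so the first 1-D pass only needs the first four input rows, loaded
 * at a stride of one 16-coefficient row, before the 8x4 transpose. */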
- in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 2); - in[2] = load_input_data(input + 8 * 4); - in[3] = load_input_data(input + 8 * 6); - - TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); - - // Stage2 - { - const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); - const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); - - tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); - tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); - tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); - tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp2_8 = _mm_packs_epi32(tmp0, tmp2); - stp2_11 = _mm_packs_epi32(tmp5, tmp7); - } - - // Stage3 - { - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); - - tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); - tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); - stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - { - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); - - tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); - tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); - tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); - tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); - tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); - tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp1_0 = _mm_packs_epi32(tmp0, tmp0); - stp1_1 = _mm_packs_epi32(tmp2, tmp2); - stp2_9 = _mm_packs_epi32(tmp1, tmp3); - stp2_10 = _mm_packs_epi32(tmp5, tmp7); - - stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); - } - - // Stage5 and Stage6 - { - tmp0 = _mm_add_epi16(stp2_8, stp2_11); - tmp1 = _mm_sub_epi16(stp2_8, stp2_11); - tmp2 = _mm_add_epi16(stp2_9, stp2_10); - tmp3 = _mm_sub_epi16(stp2_9, stp2_10); - - stp1_9 = _mm_unpacklo_epi64(tmp2, zero); - stp1_10 = _mm_unpacklo_epi64(tmp3, zero); - stp1_8 = _mm_unpacklo_epi64(tmp0, zero); - stp1_11 = _mm_unpacklo_epi64(tmp1, zero); - - stp1_13 = _mm_unpackhi_epi64(tmp3, zero); - stp1_14 = _mm_unpackhi_epi64(tmp2, zero); - stp1_12 = _mm_unpackhi_epi64(tmp1, zero); - stp1_15 = _mm_unpackhi_epi64(tmp0, zero); - } - - // Stage6 - { - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); - - tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); - tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); - tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); - tmp2 = 
_mm_madd_epi16(lo_10_13, stg4_0); - tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); - tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); - - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_6 = _mm_packs_epi32(tmp3, tmp1); - - stp2_10 = _mm_packs_epi32(tmp0, zero); - stp2_13 = _mm_packs_epi32(tmp2, zero); - stp2_11 = _mm_packs_epi32(tmp4, zero); - stp2_12 = _mm_packs_epi32(tmp6, zero); - - tmp0 = _mm_add_epi16(stp1_0, stp1_4); - tmp1 = _mm_sub_epi16(stp1_0, stp1_4); - tmp2 = _mm_add_epi16(stp1_1, stp1_6); - tmp3 = _mm_sub_epi16(stp1_1, stp1_6); - - stp2_0 = _mm_unpackhi_epi64(tmp0, zero); - stp2_1 = _mm_unpacklo_epi64(tmp2, zero); - stp2_2 = _mm_unpackhi_epi64(tmp2, zero); - stp2_3 = _mm_unpacklo_epi64(tmp0, zero); - stp2_4 = _mm_unpacklo_epi64(tmp1, zero); - stp2_5 = _mm_unpackhi_epi64(tmp3, zero); - stp2_6 = _mm_unpacklo_epi64(tmp3, zero); - stp2_7 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage7. Left 8x16 only. - l[0] = _mm_add_epi16(stp2_0, stp1_15); - l[1] = _mm_add_epi16(stp2_1, stp1_14); - l[2] = _mm_add_epi16(stp2_2, stp2_13); - l[3] = _mm_add_epi16(stp2_3, stp2_12); - l[4] = _mm_add_epi16(stp2_4, stp2_11); - l[5] = _mm_add_epi16(stp2_5, stp2_10); - l[6] = _mm_add_epi16(stp2_6, stp1_9); - l[7] = _mm_add_epi16(stp2_7, stp1_8); - l[8] = _mm_sub_epi16(stp2_7, stp1_8); - l[9] = _mm_sub_epi16(stp2_6, stp1_9); - l[10] = _mm_sub_epi16(stp2_5, stp2_10); - l[11] = _mm_sub_epi16(stp2_4, stp2_11); - l[12] = _mm_sub_epi16(stp2_3, stp2_12); - l[13] = _mm_sub_epi16(stp2_2, stp2_13); - l[14] = _mm_sub_epi16(stp2_1, stp1_14); - l[15] = _mm_sub_epi16(stp2_0, stp1_15); - - // Second 1-D inverse transform, performed per 8x16 block - for (i = 0; i < 2; i++) { - int j; - array_transpose_4X8(l + 8 * i, in); - - IDCT16_10 - - // Stage7 - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); - - for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); } -#define LOAD_DQCOEFF(reg, input) \ - { \ - reg = 
load_input_data(input); \ - input += 8; \ - } +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[16]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[16], step2[16]; -#define IDCT32_34 \ - /* Stage1 */ \ - { \ - const __m128i zero = _mm_setzero_si128(); \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ - \ - const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ - \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \ - stp1_31); \ - MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \ - stp1_28); \ - MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \ - stp1_27); \ - MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \ - stp1_24); \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i zero = _mm_setzero_si128(); \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ - \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \ - stp2_15); \ - MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \ - stp2_12); \ - \ - stp2_16 = stp1_16; \ - stp2_19 = stp1_19; \ - \ - stp2_20 = stp1_20; \ - stp2_23 = stp1_23; \ - \ - stp2_24 = stp1_24; \ - stp2_27 = stp1_27; \ - \ - stp2_28 = stp1_28; \ - stp2_31 = stp1_31; \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i zero = _mm_setzero_si128(); \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ - \ - MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \ - stp1_7); \ - \ - stp1_8 = stp2_8; \ - stp1_11 = stp2_11; \ - stp1_12 = stp2_12; \ - stp1_15 = stp2_15; \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ - stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ - stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i zero = _mm_setzero_si128(); \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i 
hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \ - stp2_1); \ - \ - stp2_4 = stp1_4; \ - stp2_5 = stp1_4; \ - stp2_6 = stp1_7; \ - stp2_7 = stp1_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = stp2_0; \ - stp1_1 = stp2_1; \ - stp1_2 = stp2_1; \ - stp1_3 = stp2_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ - stp1_28) \ - 
MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ - } \ - \ - /* Stage7 */ \ - { \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, 
\ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ - stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } + // stage 2 + butterfly(in[2], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(zero, in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); -#define IDCT32 \ - /* Stage1 */ \ - { \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ - const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ - const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ - \ - const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ - const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ - const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ - const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ - const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ - \ - const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ - const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \ - stp1_30) \ - MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \ - stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ - stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ - stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ - const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ - const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ - \ - const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ - const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ - stp2_14) \ - MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ - stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ - stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ - stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ - \ - stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ - stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ - stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ - stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ - \ - stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ - stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ - stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ - stp2_31 = 
_mm_add_epi16(stp1_31, stp1_30); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ - const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ - const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - \ - MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ - stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ - stp1_6) \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ - stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ - stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ - const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ - const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \ - stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = 
_mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ - stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 
= _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ - } \ - \ - /* Stage7 */ \ - { \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ - stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_34_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_34_8x32_quarter_1(in, temp); + idct32_34_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); +} + +// For each 8x32 block __m128i in[32], +// Input with odd index, 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[32]; + + // stage 1 + butterfly(in[1], zero, cospi_31_64, cospi_1_64, &step1[16], &step1[31]); + butterfly(zero, in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); + butterfly(in[5], zero, cospi_27_64, cospi_5_64, 
&step1[20], &step1[27]); + butterfly(zero, in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); + + // stage 3 + butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} + +void idct32_34_8x32_sse2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; + + idct32_34_8x32_quarter_1_2(in, temp); + idct32_34_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); +} // Only upper-left 8x8 has non-zero coeff void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[32]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, - stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, - stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i io[32], col[32]; int i; // Load input data. 
Only need to load the top left 8x8 block. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 32); - in[2] = load_input_data(input + 64); - in[3] = load_input_data(input + 96); - in[4] = load_input_data(input + 128); - in[5] = load_input_data(input + 160); - in[6] = load_input_data(input + 192); - in[7] = load_input_data(input + 224); + load_transpose_16bit_8x8(input, 32, io); + idct32_34_8x32_sse2(io, col); - array_transpose_8x8(in, in); - IDCT32_34 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[0] = _mm_add_epi16(stp1_0, stp1_31); - col[1] = _mm_add_epi16(stp1_1, stp1_30); - col[2] = _mm_add_epi16(stp1_2, stp1_29); - col[3] = _mm_add_epi16(stp1_3, stp1_28); - col[4] = _mm_add_epi16(stp1_4, stp1_27); - col[5] = _mm_add_epi16(stp1_5, stp1_26); - col[6] = _mm_add_epi16(stp1_6, stp1_25); - col[7] = _mm_add_epi16(stp1_7, stp1_24); - col[8] = _mm_add_epi16(stp1_8, stp1_23); - col[9] = _mm_add_epi16(stp1_9, stp1_22); - col[10] = _mm_add_epi16(stp1_10, stp1_21); - col[11] = _mm_add_epi16(stp1_11, stp1_20); - col[12] = _mm_add_epi16(stp1_12, stp1_19); - col[13] = _mm_add_epi16(stp1_13, stp1_18); - col[14] = _mm_add_epi16(stp1_14, stp1_17); - col[15] = _mm_add_epi16(stp1_15, stp1_16); - col[16] = _mm_sub_epi16(stp1_15, stp1_16); - col[17] = _mm_sub_epi16(stp1_14, stp1_17); - col[18] = _mm_sub_epi16(stp1_13, stp1_18); - col[19] = _mm_sub_epi16(stp1_12, stp1_19); - col[20] = _mm_sub_epi16(stp1_11, stp1_20); - col[21] = _mm_sub_epi16(stp1_10, stp1_21); - col[22] = _mm_sub_epi16(stp1_9, stp1_22); - col[23] = _mm_sub_epi16(stp1_8, stp1_23); - col[24] = _mm_sub_epi16(stp1_7, stp1_24); - col[25] = _mm_sub_epi16(stp1_6, stp1_25); - col[26] = _mm_sub_epi16(stp1_5, stp1_26); - col[27] = _mm_sub_epi16(stp1_4, stp1_27); - col[28] = _mm_sub_epi16(stp1_3, stp1_28); - col[29] = _mm_sub_epi16(stp1_2, stp1_29); - col[30] = _mm_sub_epi16(stp1_1, stp1_30); - col[31] = _mm_sub_epi16(stp1_0, stp1_31); - for (i = 0; i < 4; i++) { + for (i = 0; i < 32; i += 8) { int j; - const __m128i zero = _mm_setzero_si128(); - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + i * 8, in); - IDCT32_34 - - // 2_D: Calculate the results and store them to destination. 
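/* The final stage of the 32-point transform is a mirrored add/sub pass:
 * outputs j and 31-j are the sum and difference of intermediates j and 31-j.
 * That is exactly what the explicit assignments below spell out, and what
 * add_sub_butterfly(temp, out, 32) in the new code presumably wraps (the
 * helper is defined elsewhere in this patch).  A minimal sketch of the
 * assumed behaviour:
 *
 *   for (int j = 0; j < 16; ++j) {
 *     out[j]      = _mm_add_epi16(temp[j], temp[31 - j]);
 *     out[31 - j] = _mm_sub_epi16(temp[j], temp[31 - j]);
 *   }
 */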
- in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); + transpose_16bit_8x8(col + i, io); + idct32_34_8x32_sse2(io, io); for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); + write_buffer_8x1(dest + j * stride, io[j]); } dest += 8; } } +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_1024_8x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + butterfly(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + + // stage 4 + butterfly(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); +} + +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_1024_8x32_quarter_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + butterfly(in[2], in[30], 
cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi16(step2[8], step2[9]); + step1[9] = _mm_sub_epi16(step2[8], step2[9]); + step1[10] = _mm_sub_epi16(step2[11], step2[10]); + step1[11] = _mm_add_epi16(step2[11], step2[10]); + step1[12] = _mm_add_epi16(step2[12], step2[13]); + step1[13] = _mm_sub_epi16(step2[12], step2[13]); + step1[14] = _mm_sub_epi16(step2[15], step2[14]); + step1[15] = _mm_add_epi16(step2[15], step2[14]); + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_1024_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_1024_8x32_quarter_1(in, temp); + idct32_1024_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); +} + +// For each 8x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_1024_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + butterfly(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]); + butterfly(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]); + butterfly(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]); + butterfly(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); + + butterfly(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]); + butterfly(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]); + + butterfly(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]); + butterfly(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi16(step1[16], step1[17]); + step2[17] = _mm_sub_epi16(step1[16], step1[17]); + step2[18] = _mm_sub_epi16(step1[19], step1[18]); + step2[19] = _mm_add_epi16(step1[19], step1[18]); + step2[20] = _mm_add_epi16(step1[20], step1[21]); + step2[21] = _mm_sub_epi16(step1[20], step1[21]); + step2[22] = _mm_sub_epi16(step1[23], step1[22]); + step2[23] = _mm_add_epi16(step1[23], step1[22]); + + step2[24] = _mm_add_epi16(step1[24], step1[25]); + step2[25] = _mm_sub_epi16(step1[24], step1[25]); + step2[26] = _mm_sub_epi16(step1[27], step1[26]); + step2[27] = _mm_add_epi16(step1[27], step1[26]); + step2[28] = _mm_add_epi16(step1[28], step1[29]); + step2[29] = _mm_sub_epi16(step1[28], step1[29]); + step2[30] = _mm_sub_epi16(step1[31], step1[30]); + step2[31] = _mm_add_epi16(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} + +void idct32_1024_8x32(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) 
{ + __m128i temp[32]; + + idct32_1024_8x32_quarter_1_2(in, temp); + idct32_1024_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); +} + void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[128], zero_idx[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, - 
stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, - stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i, j, i32; + __m128i col[4][32], io[32]; + int i; + // rows for (i = 0; i < 4; i++) { - i32 = (i << 5); - // First 1-D idct - // Load input data. - LOAD_DQCOEFF(in[0], input); - LOAD_DQCOEFF(in[8], input); - LOAD_DQCOEFF(in[16], input); - LOAD_DQCOEFF(in[24], input); - LOAD_DQCOEFF(in[1], input); - LOAD_DQCOEFF(in[9], input); - LOAD_DQCOEFF(in[17], input); - LOAD_DQCOEFF(in[25], input); - LOAD_DQCOEFF(in[2], input); - LOAD_DQCOEFF(in[10], input); - LOAD_DQCOEFF(in[18], input); - LOAD_DQCOEFF(in[26], input); - LOAD_DQCOEFF(in[3], input); - LOAD_DQCOEFF(in[11], input); - LOAD_DQCOEFF(in[19], input); - LOAD_DQCOEFF(in[27], input); - - LOAD_DQCOEFF(in[4], input); - LOAD_DQCOEFF(in[12], input); - LOAD_DQCOEFF(in[20], input); - LOAD_DQCOEFF(in[28], input); - LOAD_DQCOEFF(in[5], input); - LOAD_DQCOEFF(in[13], input); - LOAD_DQCOEFF(in[21], input); - LOAD_DQCOEFF(in[29], input); - LOAD_DQCOEFF(in[6], input); - LOAD_DQCOEFF(in[14], input); - LOAD_DQCOEFF(in[22], input); - LOAD_DQCOEFF(in[30], input); - LOAD_DQCOEFF(in[7], input); - LOAD_DQCOEFF(in[15], input); - LOAD_DQCOEFF(in[23], input); - LOAD_DQCOEFF(in[31], input); - - // checking if all entries are zero - zero_idx[0] = _mm_or_si128(in[0], in[1]); - zero_idx[1] = _mm_or_si128(in[2], in[3]); - zero_idx[2] = _mm_or_si128(in[4], in[5]); - zero_idx[3] = _mm_or_si128(in[6], in[7]); - zero_idx[4] = _mm_or_si128(in[8], in[9]); - zero_idx[5] = _mm_or_si128(in[10], in[11]); - zero_idx[6] = _mm_or_si128(in[12], in[13]); - zero_idx[7] = _mm_or_si128(in[14], in[15]); - zero_idx[8] = _mm_or_si128(in[16], in[17]); - zero_idx[9] = _mm_or_si128(in[18], in[19]); - zero_idx[10] = _mm_or_si128(in[20], in[21]); - zero_idx[11] = _mm_or_si128(in[22], in[23]); - zero_idx[12] = _mm_or_si128(in[24], in[25]); - zero_idx[13] = _mm_or_si128(in[26], in[27]); - zero_idx[14] = _mm_or_si128(in[28], in[29]); - zero_idx[15] = _mm_or_si128(in[30], in[31]); - - zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); - zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); - - zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); - - if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { - col[i32 + 0] = _mm_setzero_si128(); - col[i32 + 1] = _mm_setzero_si128(); - col[i32 + 2] = _mm_setzero_si128(); - col[i32 + 3] = _mm_setzero_si128(); - col[i32 + 4] = _mm_setzero_si128(); - col[i32 + 5] = _mm_setzero_si128(); - col[i32 + 6] = _mm_setzero_si128(); - col[i32 + 7] = 
_mm_setzero_si128(); - col[i32 + 8] = _mm_setzero_si128(); - col[i32 + 9] = _mm_setzero_si128(); - col[i32 + 10] = _mm_setzero_si128(); - col[i32 + 11] = _mm_setzero_si128(); - col[i32 + 12] = _mm_setzero_si128(); - col[i32 + 13] = _mm_setzero_si128(); - col[i32 + 14] = _mm_setzero_si128(); - col[i32 + 15] = _mm_setzero_si128(); - col[i32 + 16] = _mm_setzero_si128(); - col[i32 + 17] = _mm_setzero_si128(); - col[i32 + 18] = _mm_setzero_si128(); - col[i32 + 19] = _mm_setzero_si128(); - col[i32 + 20] = _mm_setzero_si128(); - col[i32 + 21] = _mm_setzero_si128(); - col[i32 + 22] = _mm_setzero_si128(); - col[i32 + 23] = _mm_setzero_si128(); - col[i32 + 24] = _mm_setzero_si128(); - col[i32 + 25] = _mm_setzero_si128(); - col[i32 + 26] = _mm_setzero_si128(); - col[i32 + 27] = _mm_setzero_si128(); - col[i32 + 28] = _mm_setzero_si128(); - col[i32 + 29] = _mm_setzero_si128(); - col[i32 + 30] = _mm_setzero_si128(); - col[i32 + 31] = _mm_setzero_si128(); - continue; - } - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); - col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); - col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); - col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); - col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); - col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); - col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); - col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); - col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); - col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); - col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); - col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); - col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); - col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); - col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); - col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); - col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); - col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); - col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); - col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); - col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); - col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); - col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); - col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); - col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); - col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); - col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); - col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); - col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); - col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); - col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); - col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); + load_transpose_16bit_8x8(&input[0], 32, &io[0]); + load_transpose_16bit_8x8(&input[8], 32, &io[8]); + load_transpose_16bit_8x8(&input[16], 32, &io[16]); + load_transpose_16bit_8x8(&input[24], 32, &io[24]); + idct32_1024_8x32(io, col[i]); + input += 32 << 3; } - for (i = 0; i < 4; i++) { - // Second 1-D idct - j = i << 3; + // columns + for (i = 0; i < 32; i += 8) { // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + j, in); - array_transpose_8x8(col + j + 32, in + 8); - array_transpose_8x8(col + j + 64, in + 16); - array_transpose_8x8(col + j + 96, in + 24); + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + transpose_16bit_8x8(col[2] + i, io + 16); + 
transpose_16bit_8x8(col[3] + i, io + 24); - IDCT32 + idct32_1024_8x32(io, io); + store_buffer_8x32(io, dest, stride); + dest += 8; + } +} - // 2_D: Calculate the results and store them to destination. - in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i col[2][32], in[32], out[32]; + int i; - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } + for (i = 16; i < 32; i++) { + in[i] = _mm_setzero_si128(); + } + // rows + for (i = 0; i < 2; i++) { + load_transpose_16bit_8x8(&input[0], 32, &in[0]); + load_transpose_16bit_8x8(&input[8], 32, &in[8]); + idct32_1024_8x32(in, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, in); + transpose_16bit_8x8(col[1] + i, in + 8); + idct32_1024_8x32(in, out); + store_buffer_8x32(out, dest, stride); dest += 8; } } @@ -3431,611 +1334,17 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, j; + int j; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + dc_value = _mm_set1_epi16((int16_t)a1); for (j = 0; j < 32; ++j) { - RECON_AND_STORE(dest + 0 + j * stride, dc_value); - RECON_AND_STORE(dest + 8 + j * stride, dc_value); - RECON_AND_STORE(dest + 16 + j * stride, dc_value); - RECON_AND_STORE(dest + 24 + j * stride, dc_value); + recon_and_store_16(dest + j * stride + 0, dc_value); + recon_and_store_16(dest + j * stride + 16, dc_value); } } - -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { - __m128i ubounded, retval; - const __m128i 
zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); - ubounded = _mm_cmpgt_epi16(value, max); - retval = _mm_andnot_si128(ubounded, value); - ubounded = _mm_and_si128(ubounded, max); - retval = _mm_or_si128(retval, ubounded); - retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); - return retval; -} - -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - __m128i inptr[4]; - __m128i sign_bits[2]; - __m128i temp_mm, min_input, max_input; - int test; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - int optimised_cols = 0; - const __m128i zero = _mm_set1_epi16(0); - const __m128i eight = _mm_set1_epi16(8); - const __m128i max = _mm_set1_epi16(12043); - const __m128i min = _mm_set1_epi16(-12043); - // Load input into __m128i - inptr[0] = _mm_loadu_si128((const __m128i *)input); - inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); - inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); - inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); - - // Pack to 16 bits - inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]); - inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); - - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (!test) { - // Do the row transform - idct4_sse2(inptr); - - // Check the min & max values - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (test) { - transpose_4x4(inptr); - sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); - sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); - inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); - inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); - inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); - inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); - _mm_storeu_si128((__m128i *)outptr, inptr[0]); - _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); - _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); - _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct4_c(input, outptr, bd); - input += 4; - outptr += 4; - } - } - - if (optimised_cols) { - idct4_sse2(inptr); - - // Final round and shift - inptr[0] = _mm_add_epi16(inptr[0], eight); - inptr[1] = _mm_add_epi16(inptr[1], eight); - - inptr[0] = _mm_srai_epi16(inptr[0], 4); - inptr[1] = _mm_srai_epi16(inptr[1], 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); - __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi64( - d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); - d2 = _mm_unpacklo_epi64( - d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); - d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); - d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); - // store input0 - _mm_storel_epi64((__m128i *)dest, d0); - // store 
input1 - d0 = _mm_srli_si128(d0, 8); - _mm_storel_epi64((__m128i *)(dest + stride), d0); - // store input2 - _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); - // store input3 - d2 = _mm_srli_si128(d2, 8); - _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[4], temp_out[4]; - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - vpx_highbd_idct4_c(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } - } - } -} - -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_8x8(inptr, inptr); - for (i = 0; i < 8; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 8; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; 
++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // only first 4 row has non-zero coefs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_4X8(inptr, inptr); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 
5), bd); - } - } - } -} - -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_16x16(inptr, inptr + 16); - for (i = 0; i < 16; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 16; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i] = _mm_add_epi16(inptr[i], rounding); - inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); - inptr[i] = _mm_srai_epi16(inptr[i], 6); - inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + 
stride * i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} - -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // Since all non-zero dct coefficients are in upper-left 4x4 area, - // we only need to consider first 4 rows here. - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform (N.B. This transposes inptr) - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - // N.B. 
Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 16; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_8x8(inptr, inptr); - array_transpose_8x8(inptr + 8, inptr + 16); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i] = _mm_add_epi16(inptr[i], rounding); - inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); - inptr[i] = _mm_srai_epi16(inptr[i], 6); - inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} - -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - __m128i dc_value, d; - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); - int a, i, j; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - tran_low_t out; - - out = highbd_dct_const_round_shift(input[0] * cospi_16_64); - out = highbd_dct_const_round_shift(out * cospi_16_64); - a = ROUND_POWER_OF_TWO(out, 6); - - d = _mm_set1_epi32(a); - dc_value = _mm_packs_epi32(d, d); - for (i = 0; i < 32; ++i) { - for (j = 0; j < 4; ++j) { - d = _mm_loadu_si128((const __m128i *)(&dest[j * 8])); - d = _mm_adds_epi16(d, dc_value); - d = _mm_max_epi16(d, zero); - d = _mm_min_epi16(d, max); - _mm_storeu_si128((__m128i *)(&dest[j * 8]), d); - } - dest += stride; - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH diff --git 
a/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h index d762a04abc..5cd5098f14 100644 --- a/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h @@ -12,185 +12,698 @@ #define VPX_DSP_X86_INV_TXFM_SSE2_H_ #include // SSE2 + #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -// perform 8x8 transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); +static INLINE void idct8x8_12_transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 30 31 32 33 00 01 02 03 + // in[1]: 20 21 22 23 10 11 12 13 + // in[2]: 40 41 42 43 70 71 72 73 + // in[3]: 50 51 52 53 60 61 62 63 + // to: + // tr0_0: 00 10 01 11 02 12 03 13 + // tr0_1: 20 30 21 31 22 32 23 33 + // tr0_2: 40 50 41 51 42 52 43 53 + // tr0_3: 60 70 61 71 62 72 63 73 + const __m128i tr0_0 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[1], in[0]); + const __m128i tr0_2 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[3], in[2]); + // Unpack 32 bit elements resulting in: + // tr1_0: 00 10 20 30 01 11 21 31 + // tr1_1: 02 12 22 32 03 13 23 33 + // tr1_2: 40 50 60 70 41 51 61 71 + // tr1_3: 42 52 62 72 43 53 63 73 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + out[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); } -#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - \ - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ - in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ - } - -static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) { - const 
__m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - - out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); - out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); - out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); - out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); +static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) { + const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING)); + return _mm_srai_epi32(t, DCT_CONST_BITS); } -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; +static INLINE __m128i idct_madd_round_shift_sse2(const __m128i in, + const __m128i cospi) { + const __m128i t = _mm_madd_epi16(in, cospi); + return dct_const_round_shift_sse2(t); } -// Function to allow 8 bit optimisations to be used when profile 0 is used with +// Calculate the dot product between in0/1 and x and wrap to short. +static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0, + const __m128i in1, + const __m128i x) { + const __m128i t0 = idct_madd_round_shift_sse2(in0, x); + const __m128i t1 = idct_madd_round_shift_sse2(in1, x); + return _mm_packs_epi32(t0, t1); +} + +// Multiply elements by constants and add them together. 
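The butterfly() that follows is the fixed-point rotation used throughout these IDCT stages: it interleaves the two inputs, multiplies by the constant pair with _mm_madd_epi16, rounds by DCT_CONST_ROUNDING, shifts right by DCT_CONST_BITS (14 in libvpx), and packs back to 16 bits. A per-element scalar sketch, for illustration only (the name butterfly_scalar is hypothetical; DCT_CONST_BITS comes from vpx_dsp/txfm_common.h):

static void butterfly_scalar(int16_t in0, int16_t in1, int c0, int c1,
                             int16_t *out0, int16_t *out1) {
  const int32_t r = 1 << (DCT_CONST_BITS - 1);  // DCT_CONST_ROUNDING
  // out0 = round((in0 * c0 - in1 * c1) / 2^DCT_CONST_BITS)
  // out1 = round((in0 * c1 + in1 * c0) / 2^DCT_CONST_BITS)
  *out0 = (int16_t)((in0 * c0 - in1 * c1 + r) >> DCT_CONST_BITS);
  *out1 = (int16_t)((in0 * c1 + in1 * c0 + r) >> DCT_CONST_BITS);
}

The SSE2 version additionally saturates via _mm_packs_epi32, so intermediate results that exceed the int16_t range clamp rather than wrap.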
+static INLINE void butterfly(const __m128i in0, const __m128i in1, const int c0, + const int c1, __m128i *const out0, + __m128i *const out1) { + const __m128i cst0 = pair_set_epi16(c0, -c1); + const __m128i cst1 = pair_set_epi16(c1, c0); + const __m128i lo = _mm_unpacklo_epi16(in0, in1); + const __m128i hi = _mm_unpackhi_epi16(in0, in1); + *out0 = idct_calc_wraplow_sse2(lo, hi, cst0); + *out1 = idct_calc_wraplow_sse2(lo, hi, cst1); +} + +static INLINE __m128i butterfly_cospi16(const __m128i in) { + const __m128i cst = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i lo = _mm_unpacklo_epi16(in, _mm_setzero_si128()); + const __m128i hi = _mm_unpackhi_epi16(in, _mm_setzero_si128()); + return idct_calc_wraplow_sse2(lo, hi, cst); +} + +// Functions to allow 8 bit optimisations to be used when profile 0 is used with // highbitdepth enabled -static INLINE __m128i load_input_data(const tran_low_t *data) { +static INLINE __m128i load_input_data4(const tran_low_t *data) { #if CONFIG_VP9_HIGHBITDEPTH - return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], - data[6], data[7]); + const __m128i zero = _mm_setzero_si128(); + const __m128i in = _mm_load_si128((const __m128i *)data); + return _mm_packs_epi32(in, zero); +#else + return _mm_loadl_epi64((const __m128i *)data); +#endif +} + +static INLINE __m128i load_input_data8(const tran_low_t *data) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i in0 = _mm_load_si128((const __m128i *)data); + const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4)); + return _mm_packs_epi32(in0, in1); #else return _mm_load_si128((const __m128i *)data); #endif } -static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { - in[0] = load_input_data(input + 0 * 16); - in[1] = load_input_data(input + 1 * 16); - in[2] = load_input_data(input + 2 * 16); - in[3] = load_input_data(input + 3 * 16); - in[4] = load_input_data(input + 4 * 16); - in[5] = load_input_data(input + 5 * 16); - in[6] = load_input_data(input + 6 * 16); - in[7] = load_input_data(input + 7 * 16); - - in[8] = load_input_data(input + 8 * 16); - in[9] = load_input_data(input + 9 * 16); - in[10] = load_input_data(input + 10 * 16); - in[11] = load_input_data(input + 11 * 16); - in[12] = load_input_data(input + 12 * 16); - in[13] = load_input_data(input + 13 * 16); - in[14] = load_input_data(input + 14 * 16); - in[15] = load_input_data(input + 15 * 16); +static INLINE void load_transpose_16bit_8x8(const tran_low_t *input, + const int stride, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * stride); + in[1] = load_input_data8(input + 1 * stride); + in[2] = load_input_data8(input + 2 * stride); + in[3] = load_input_data8(input + 3 * stride); + in[4] = load_input_data8(input + 4 * stride); + in[5] = load_input_data8(input + 5 * stride); + in[6] = load_input_data8(input + 6 * stride); + in[7] = load_input_data8(input + 7 * stride); + transpose_16bit_8x8(in, in); } -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ +static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) { + const __m128i zero = _mm_setzero_si128(); + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); + d0 = _mm_unpacklo_epi8(d0, zero); + d0 = _mm_add_epi16(in_x, d0); + d0 = _mm_packus_epi16(d0, d0); + _mm_storel_epi64((__m128i *)(dest), d0); +} + +static INLINE void 
round_shift_8x8(const __m128i *const in, + __m128i *const out) { + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + + out[0] = _mm_add_epi16(in[0], final_rounding); + out[1] = _mm_add_epi16(in[1], final_rounding); + out[2] = _mm_add_epi16(in[2], final_rounding); + out[3] = _mm_add_epi16(in[3], final_rounding); + out[4] = _mm_add_epi16(in[4], final_rounding); + out[5] = _mm_add_epi16(in[5], final_rounding); + out[6] = _mm_add_epi16(in[6], final_rounding); + out[7] = _mm_add_epi16(in[7], final_rounding); + + out[0] = _mm_srai_epi16(out[0], 5); + out[1] = _mm_srai_epi16(out[1], 5); + out[2] = _mm_srai_epi16(out[2], 5); + out[3] = _mm_srai_epi16(out[3], 5); + out[4] = _mm_srai_epi16(out[4], 5); + out[5] = _mm_srai_epi16(out[5], 5); + out[6] = _mm_srai_epi16(out[6], 5); + out[7] = _mm_srai_epi16(out[7], 5); +} + +static INLINE void write_buffer_8x8(const __m128i *const in, + uint8_t *const dest, const int stride) { + __m128i t[8]; + + round_shift_8x8(in, t); + + recon_and_store(dest + 0 * stride, t[0]); + recon_and_store(dest + 1 * stride, t[1]); + recon_and_store(dest + 2 * stride, t[2]); + recon_and_store(dest + 3 * stride, t[3]); + recon_and_store(dest + 4 * stride, t[4]); + recon_and_store(dest + 5 * stride, t[5]); + recon_and_store(dest + 6 * stride, t[6]); + recon_and_store(dest + 7 * stride, t[7]); +} + +static INLINE void recon_and_store4x4_sse2(const __m128i *const in, + uint8_t *const dest, + const int stride) { + const __m128i zero = _mm_setzero_si128(); + __m128i d[2]; + + // Reconstruction and Store + d[0] = _mm_cvtsi32_si128(*(const int *)(dest)); + d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); + d[0] = _mm_unpacklo_epi32(d[0], + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d[1] = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]); + d[0] = _mm_unpacklo_epi8(d[0], zero); + d[1] = _mm_unpacklo_epi8(d[1], zero); + d[0] = _mm_add_epi16(d[0], in[0]); + d[1] = _mm_add_epi16(d[1], in[1]); + d[0] = _mm_packus_epi16(d[0], d[1]); + + *(int *)dest = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]); +} + +static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + int j = 0; + while (j < 32) { + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding); + + in[j] = _mm_srai_epi16(in[j], 6); + in[j + 1] = _mm_srai_epi16(in[j + 1], 6); + + recon_and_store(dst, in[j]); + dst += stride; + recon_and_store(dst, in[j + 1]); + dst += stride; + j += 2; + } +} + +static INLINE void write_buffer_8x1(uint8_t *const dest, const __m128i in) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + __m128i out; + out = _mm_adds_epi16(in, final_rounding); + out = _mm_srai_epi16(out, 6); + recon_and_store(dest, out); +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out, + int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm_add_epi16(in[i], in[bound - i]); + out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]); + i++; + } +} + +static INLINE void idct8(const __m128i *const in /*in[8]*/, + __m128i *const out /*out[8]*/) { + __m128i 
step1[8], step2[8]; + + // stage 1 + butterfly(in[1], in[7], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly(in[5], in[3], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + + // stage 2 + butterfly(in[0], in[4], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly(in[2], in[6], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + + // stage 4 + out[0] = _mm_add_epi16(step1[0], step2[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step2[4]); + out[4] = _mm_sub_epi16(step1[3], step2[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_add_kernel_sse2(__m128i *const io /*io[8]*/) { + const __m128i zero = _mm_setzero_si128(); + const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + __m128i step1[8], step2[8], tmp[4]; + + transpose_16bit_4x4(io, io); + // io[0]: 00 10 20 30 01 11 21 31 + // io[1]: 02 12 22 32 03 13 23 33 + + // stage 1 + { + const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i lo_1 = _mm_unpackhi_epi16(io[0], zero); + const __m128i lo_3 = _mm_unpackhi_epi16(io[1], zero); + step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1); // step1 4&7 + step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3); // step1 5&6 } -static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - in[4] = _mm_adds_epi16(in[4], final_rounding); - in[5] = _mm_adds_epi16(in[5], final_rounding); - in[6] = _mm_adds_epi16(in[6], final_rounding); - in[7] = _mm_adds_epi16(in[7], final_rounding); - in[8] = _mm_adds_epi16(in[8], final_rounding); - in[9] = _mm_adds_epi16(in[9], final_rounding); - in[10] = _mm_adds_epi16(in[10], final_rounding); - in[11] = _mm_adds_epi16(in[11], final_rounding); - in[12] = _mm_adds_epi16(in[12], final_rounding); - in[13] = _mm_adds_epi16(in[13], final_rounding); - in[14] = _mm_adds_epi16(in[14], final_rounding); - in[15] = _mm_adds_epi16(in[15], final_rounding); + // stage 2 + { + const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i lo_0 = _mm_unpacklo_epi16(io[0], zero); + const __m128i lo_2 = _mm_unpacklo_epi16(io[1], zero); + const __m128i t = idct_madd_round_shift_sse2(cp_16_16, lo_0); + step2[0] = _mm_packs_epi32(t, 
t); // step2 0&1 + step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2); // step2 3&2 + step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 + step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 + step2[6] = _mm_unpackhi_epi64(step2[5], zero); // step2 6 + } - in[0] = _mm_srai_epi16(in[0], 6); - in[1] = _mm_srai_epi16(in[1], 6); - in[2] = _mm_srai_epi16(in[2], 6); - in[3] = _mm_srai_epi16(in[3], 6); - in[4] = _mm_srai_epi16(in[4], 6); - in[5] = _mm_srai_epi16(in[5], 6); - in[6] = _mm_srai_epi16(in[6], 6); - in[7] = _mm_srai_epi16(in[7], 6); - in[8] = _mm_srai_epi16(in[8], 6); - in[9] = _mm_srai_epi16(in[9], 6); - in[10] = _mm_srai_epi16(in[10], 6); - in[11] = _mm_srai_epi16(in[11], 6); - in[12] = _mm_srai_epi16(in[12], 6); - in[13] = _mm_srai_epi16(in[13], 6); - in[14] = _mm_srai_epi16(in[14], 6); - in[15] = _mm_srai_epi16(in[15], 6); + // stage 3 + { + const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]); + tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 + tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 + step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 + step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 + step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65); // step1 5&6 + } - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); - RECON_AND_STORE(dest + 8 * stride, in[8]); - RECON_AND_STORE(dest + 9 * stride, in[9]); - RECON_AND_STORE(dest + 10 * stride, in[10]); - RECON_AND_STORE(dest + 11 * stride, in[11]); - RECON_AND_STORE(dest + 12 * stride, in[12]); - RECON_AND_STORE(dest + 13 * stride, in[13]); - RECON_AND_STORE(dest + 14 * stride, in[14]); - RECON_AND_STORE(dest + 15 * stride, in[15]); + // stage 4 + tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 + tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 + tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 + tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 + + idct8x8_12_transpose_16bit_4x8(tmp, io); + io[4] = io[5] = io[6] = io[7] = zero; + + idct8(io, io); } -void idct4_sse2(__m128i *in); -void idct8_sse2(__m128i *in); -void idct16_sse2(__m128i *in0, __m128i *in1); -void iadst4_sse2(__m128i *in); -void iadst8_sse2(__m128i *in); -void iadst16_sse2(__m128i *in0, __m128i *in1); +static INLINE void idct16_8col(const __m128i *const in /*in[16]*/, + __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + butterfly(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[9]); + step1[9] = _mm_sub_epi16(step2[8], step2[9]); + step1[10] = _mm_sub_epi16(step2[11], step2[10]); + step1[11] = _mm_add_epi16(step2[10], step2[11]); + step1[12] = _mm_add_epi16(step2[12], step2[13]); + step1[13] = _mm_sub_epi16(step2[12], step2[13]); + step1[14] = _mm_sub_epi16(step2[15], 
step2[14]); + step1[15] = _mm_add_epi16(step2[14], step2[15]); + + // stage 4 + butterfly(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step1[4] = _mm_add_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step1[7] = _mm_add_epi16(step1[6], step1[7]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], step2[12]); + + // stage 6 + step2[0] = _mm_add_epi16(step1[0], step1[7]); + step2[1] = _mm_add_epi16(step1[1], step1[6]); + step2[2] = _mm_add_epi16(step1[2], step1[5]); + step2[3] = _mm_add_epi16(step1[3], step1[4]); + step2[4] = _mm_sub_epi16(step1[3], step1[4]); + step2[5] = _mm_sub_epi16(step1[2], step1[5]); + step2[6] = _mm_sub_epi16(step1[1], step1[6]); + step2[7] = _mm_sub_epi16(step1[0], step1[7]); + butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + out[0] = _mm_add_epi16(step2[0], step1[15]); + out[1] = _mm_add_epi16(step2[1], step1[14]); + out[2] = _mm_add_epi16(step2[2], step2[13]); + out[3] = _mm_add_epi16(step2[3], step2[12]); + out[4] = _mm_add_epi16(step2[4], step2[11]); + out[5] = _mm_add_epi16(step2[5], step2[10]); + out[6] = _mm_add_epi16(step2[6], step1[9]); + out[7] = _mm_add_epi16(step2[7], step1[8]); + out[8] = _mm_sub_epi16(step2[7], step1[8]); + out[9] = _mm_sub_epi16(step2[6], step1[9]); + out[10] = _mm_sub_epi16(step2[5], step2[10]); + out[11] = _mm_sub_epi16(step2[4], step2[11]); + out[12] = _mm_sub_epi16(step2[3], step2[12]); + out[13] = _mm_sub_epi16(step2[2], step2[13]); + out[14] = _mm_sub_epi16(step2[1], step1[14]); + out[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct16x16_10_pass1(const __m128i *const input /*input[4]*/, + __m128i *const output /*output[16]*/) { + const __m128i zero = _mm_setzero_si128(); + const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i step1[16], step2[16]; + + transpose_16bit_4x4(input, output); + + // stage 2 + { + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i lo_1_15 = _mm_unpackhi_epi16(output[0], zero); + const __m128i lo_13_3 = 
_mm_unpackhi_epi16(zero, output[1]); + step2[8] = idct_calc_wraplow_sse2(k__cospi_p30_m02, k__cospi_p02_p30, + lo_1_15); // step2 8&15 + step2[11] = idct_calc_wraplow_sse2(k__cospi_p06_m26, k__cospi_p26_p06, + lo_13_3); // step2 11&12 + } + + // stage 3 + { + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i lo_2_14 = _mm_unpacklo_epi16(output[1], zero); + step1[4] = idct_calc_wraplow_sse2(k__cospi_p28_m04, k__cospi_p04_p28, + lo_2_14); // step1 4&7 + step1[13] = _mm_unpackhi_epi64(step2[11], zero); + step1[14] = _mm_unpackhi_epi64(step2[8], zero); + } + + // stage 4 + { + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i lo_0_8 = _mm_unpacklo_epi16(output[0], zero); + const __m128i lo_9_14 = _mm_unpacklo_epi16(step2[8], step1[14]); + const __m128i lo_10_13 = _mm_unpacklo_epi16(step2[11], step1[13]); + const __m128i t = idct_madd_round_shift_sse2(lo_0_8, k__cospi_p16_p16); + step1[0] = _mm_packs_epi32(t, t); // step2 0&1 + step2[9] = idct_calc_wraplow_sse2(k__cospi_m08_p24, k__cospi_p24_p08, + lo_9_14); // step2 9&14 + step2[10] = idct_calc_wraplow_sse2(k__cospi_m24_m08, k__cospi_m08_p24, + lo_10_13); // step2 10&13 + step2[6] = _mm_unpackhi_epi64(step1[4], zero); + } + + // stage 5 + { + const __m128i lo_5_6 = _mm_unpacklo_epi16(step1[4], step2[6]); + step1[6] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_m16_p16, + lo_5_6); // step1 6&5 + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_unpackhi_epi64(step1[11], zero); + step1[13] = _mm_unpackhi_epi64(step1[10], zero); + step1[14] = _mm_unpackhi_epi64(step1[9], zero); + step1[15] = _mm_unpackhi_epi64(step1[8], zero); + } + + // stage 6 + { + const __m128i lo_10_13 = _mm_unpacklo_epi16(step1[10], step1[13]); + const __m128i lo_11_12 = _mm_unpacklo_epi16(step1[11], step1[12]); + step2[10] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16, + lo_10_13); // step2 10&13 + step2[11] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16, + lo_11_12); // step2 11&12 + step2[13] = _mm_unpackhi_epi64(step2[10], zero); + step2[12] = _mm_unpackhi_epi64(step2[11], zero); + step2[3] = _mm_add_epi16(step1[0], step1[4]); + step2[1] = _mm_add_epi16(step1[0], step1[6]); + step2[6] = _mm_sub_epi16(step1[0], step1[6]); + step2[4] = _mm_sub_epi16(step1[0], step1[4]); + step2[0] = _mm_unpackhi_epi64(step2[3], zero); + step2[2] = _mm_unpackhi_epi64(step2[1], zero); + step2[5] = _mm_unpackhi_epi64(step2[6], zero); + step2[7] = _mm_unpackhi_epi64(step2[4], zero); + } + + // stage 7. Left 8x16 only. 
+ output[0] = _mm_add_epi16(step2[0], step1[15]); + output[1] = _mm_add_epi16(step2[1], step1[14]); + output[2] = _mm_add_epi16(step2[2], step2[13]); + output[3] = _mm_add_epi16(step2[3], step2[12]); + output[4] = _mm_add_epi16(step2[4], step2[11]); + output[5] = _mm_add_epi16(step2[5], step2[10]); + output[6] = _mm_add_epi16(step2[6], step1[9]); + output[7] = _mm_add_epi16(step2[7], step1[8]); + output[8] = _mm_sub_epi16(step2[7], step1[8]); + output[9] = _mm_sub_epi16(step2[6], step1[9]); + output[10] = _mm_sub_epi16(step2[5], step2[10]); + output[11] = _mm_sub_epi16(step2[4], step2[11]); + output[12] = _mm_sub_epi16(step2[3], step2[12]); + output[13] = _mm_sub_epi16(step2[2], step2[13]); + output[14] = _mm_sub_epi16(step2[1], step1[14]); + output[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct16x16_10_pass2(__m128i *const l /*l[8]*/, + __m128i *const io /*io[16]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[16], step2[16]; + + transpose_16bit_4x8(l, io); + + // stage 2 + butterfly(io[1], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(zero, io[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly(io[2], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + + // stage 4 + step1[0] = butterfly_cospi16(io[0]); + butterfly(step2[15], step2[8], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly(step2[11], step2[12], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + + // stage 5 + butterfly(step1[7], step1[4], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], step2[12]); + + // stage 6 + step2[0] = _mm_add_epi16(step1[0], step1[7]); + step2[1] = _mm_add_epi16(step1[0], step1[6]); + step2[2] = _mm_add_epi16(step1[0], step1[5]); + step2[3] = _mm_add_epi16(step1[0], step1[4]); + step2[4] = _mm_sub_epi16(step1[0], step1[4]); + step2[5] = _mm_sub_epi16(step1[0], step1[5]); + step2[6] = _mm_sub_epi16(step1[0], step1[6]); + step2[7] = _mm_sub_epi16(step1[0], step1[7]); + butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + io[0] = _mm_add_epi16(step2[0], step1[15]); + io[1] = _mm_add_epi16(step2[1], step1[14]); + io[2] = _mm_add_epi16(step2[2], step2[13]); + io[3] = _mm_add_epi16(step2[3], step2[12]); + io[4] = _mm_add_epi16(step2[4], step2[11]); + io[5] = _mm_add_epi16(step2[5], step2[10]); + io[6] = _mm_add_epi16(step2[6], step1[9]); + io[7] = _mm_add_epi16(step2[7], step1[8]); + io[8] = _mm_sub_epi16(step2[7], step1[8]); + io[9] = _mm_sub_epi16(step2[6], step1[9]); + io[10] = _mm_sub_epi16(step2[5], step2[10]); + io[11] = _mm_sub_epi16(step2[4], step2[11]); + io[12] = _mm_sub_epi16(step2[3], step2[12]); + io[13] = _mm_sub_epi16(step2[2], step2[13]); + io[14] = _mm_sub_epi16(step2[1], step1[14]); + io[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct32_8x32_quarter_2_stage_4_to_6( + __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { + __m128i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + 
butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10], + &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], &out[13]); + butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +static INLINE void idct32_8x32_quarter_3_4_stage_4_to_7( + __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { + __m128i step2[32]; + + // stage 4 + step2[16] = _mm_add_epi16(step1[16], step1[19]); + step2[17] = _mm_add_epi16(step1[17], step1[18]); + step2[18] = _mm_sub_epi16(step1[17], step1[18]); + step2[19] = _mm_sub_epi16(step1[16], step1[19]); + step2[20] = _mm_sub_epi16(step1[23], step1[20]); + step2[21] = _mm_sub_epi16(step1[22], step1[21]); + step2[22] = _mm_add_epi16(step1[22], step1[21]); + step2[23] = _mm_add_epi16(step1[23], step1[20]); + + step2[24] = _mm_add_epi16(step1[24], step1[27]); + step2[25] = _mm_add_epi16(step1[25], step1[26]); + step2[26] = _mm_sub_epi16(step1[25], step1[26]); + step2[27] = _mm_sub_epi16(step1[24], step1[27]); + step2[28] = _mm_sub_epi16(step1[31], step1[28]); + step2[29] = _mm_sub_epi16(step1[30], step1[29]); + step2[30] = _mm_add_epi16(step1[29], step1[30]); + step2[31] = _mm_add_epi16(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + butterfly(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18], + &step1[29]); + butterfly(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19], + &step1[28]); + butterfly(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20], + &step1[27]); + butterfly(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21], + &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + out[16] = _mm_add_epi16(step1[16], step1[23]); + out[17] = _mm_add_epi16(step1[17], step1[22]); + out[18] = _mm_add_epi16(step1[18], step1[21]); + out[19] = _mm_add_epi16(step1[19], step1[20]); + step2[20] = _mm_sub_epi16(step1[19], step1[20]); + step2[21] = _mm_sub_epi16(step1[18], step1[21]); + step2[22] = _mm_sub_epi16(step1[17], step1[22]); + step2[23] = _mm_sub_epi16(step1[16], step1[23]); + + step2[24] = _mm_sub_epi16(step1[31], step1[24]); + step2[25] = _mm_sub_epi16(step1[30], step1[25]); + step2[26] = _mm_sub_epi16(step1[29], step1[26]); + step2[27] = _mm_sub_epi16(step1[28], step1[27]); + out[28] = _mm_add_epi16(step1[27], step1[28]); + out[29] = _mm_add_epi16(step1[26], step1[29]); + out[30] = _mm_add_epi16(step1[25], step1[30]); + out[31] = _mm_add_epi16(step1[24], step1[31]); + + // stage 7 + butterfly(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], &out[27]); + butterfly(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], &out[26]); + butterfly(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], &out[25]); + butterfly(step2[24], 
step2[23], cospi_16_64, cospi_16_64, &out[23], &out[24]); +} + +void idct4_sse2(__m128i *const in); +void idct8_sse2(__m128i *const in); +void idct16_sse2(__m128i *const in0, __m128i *const in1); +void iadst4_sse2(__m128i *const in); +void iadst8_sse2(__m128i *const in); +void iadst16_sse2(__m128i *const in0, __m128i *const in1); +void idct32_1024_8x32(const __m128i *const in, __m128i *const out); +void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out); +void idct32_34_8x32_ssse3(const __m128i *const in, __m128i *const out); #endif // VPX_DSP_X86_INV_TXFM_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c b/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c new file mode 100644 index 0000000000..6e99469b63 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <tmmintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_ssse3.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void partial_butterfly_ssse3(const __m128i in, const int c0, + const int c1, __m128i *const out0, + __m128i *const out1) { + const __m128i cst0 = _mm_set1_epi16(2 * c0); + const __m128i cst1 = _mm_set1_epi16(2 * c1); + *out0 = _mm_mulhrs_epi16(in, cst0); + *out1 = _mm_mulhrs_epi16(in, cst1); +} + +static INLINE __m128i partial_butterfly_cospi16_ssse3(const __m128i in) { + const __m128i coef_pair = _mm_set1_epi16(2 * cospi_16_64); + return _mm_mulhrs_epi16(in, coef_pair); +} + +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i io[8]; + + io[0] = load_input_data4(input + 0 * 8); + io[1] = load_input_data4(input + 1 * 8); + io[2] = load_input_data4(input + 2 * 8); + io[3] = load_input_data4(input + 3 * 8); + + idct8x8_12_add_kernel_ssse3(io); + write_buffer_8x8(io, dest, stride); +} + +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + + // stage 4 + step2[0] = partial_butterfly_cospi16_ssse3(in[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[0]; + step1[2] = step2[0]; + step1[3] = step2[0]; + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1],
step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); +} + +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_34_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_34_8x32_quarter_1(in, temp); + idct32_34_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); +} + +// For each 8x32 block __m128i in[32], +// Input with odd index, 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32]; + + // stage 1 + partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 3 + butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} + +void idct32_34_8x32_ssse3(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; + + idct32_34_8x32_quarter_1_2(in, temp); + idct32_34_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); +} + +// Only upper-left 8x8 has non-zero coeff +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i io[32], col[32]; + int i; + + // Load input data. Only need to load the top left 8x8 block. 
+ load_transpose_16bit_8x8(input, 32, io); + idct32_34_8x32_ssse3(io, col); + + for (i = 0; i < 32; i += 8) { + int j; + transpose_16bit_8x8(col + i, io); + idct32_34_8x32_ssse3(io, io); + + for (j = 0; j < 32; ++j) { + write_buffer_8x1(dest + j * stride, io[j]); + } + + dest += 8; + } +} + +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_135_8x32_quarter_1(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + partial_butterfly_ssse3(in[12], -cospi_20_64, cospi_12_64, &step1[5], + &step1[6]); + + // stage 4 + step2[0] = partial_butterfly_cospi16_ssse3(in[0]); + partial_butterfly_ssse3(in[8], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[0], step2[2]); + step1[2] = _mm_sub_epi16(step2[0], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); +} + +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_135_8x32_quarter_2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + partial_butterfly_ssse3(in[14], -cospi_18_64, cospi_14_64, &step2[9], + &step2[14]); + partial_butterfly_ssse3(in[10], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi16(step2[8], step2[9]); + step1[9] = _mm_sub_epi16(step2[8], step2[9]); + step1[10] = _mm_sub_epi16(step2[11], step2[10]); + step1[11] = _mm_add_epi16(step2[11], step2[10]); + step1[12] = _mm_add_epi16(step2[12], step2[13]); + step1[13] = _mm_sub_epi16(step2[12], step2[13]); + step1[14] = _mm_sub_epi16(step2[15], step2[14]); + step1[15] = _mm_add_epi16(step2[15], step2[14]); + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_135_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_135_8x32_quarter_1(in, temp); + idct32_135_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); +} + +// For each 8x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_135_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, 
&step1[16], + &step1[31]); + partial_butterfly_ssse3(in[15], -cospi_17_64, cospi_15_64, &step1[17], + &step1[30]); + partial_butterfly_ssse3(in[9], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + + partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + partial_butterfly_ssse3(in[11], -cospi_21_64, cospi_11_64, &step1[21], + &step1[26]); + + partial_butterfly_ssse3(in[13], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi16(step1[16], step1[17]); + step2[17] = _mm_sub_epi16(step1[16], step1[17]); + step2[18] = _mm_sub_epi16(step1[19], step1[18]); + step2[19] = _mm_add_epi16(step1[19], step1[18]); + step2[20] = _mm_add_epi16(step1[20], step1[21]); + step2[21] = _mm_sub_epi16(step1[20], step1[21]); + step2[22] = _mm_sub_epi16(step1[23], step1[22]); + step2[23] = _mm_add_epi16(step1[23], step1[22]); + + step2[24] = _mm_add_epi16(step1[24], step1[25]); + step2[25] = _mm_sub_epi16(step1[24], step1[25]); + step2[26] = _mm_sub_epi16(step1[27], step1[26]); + step2[27] = _mm_add_epi16(step1[27], step1[26]); + step2[28] = _mm_add_epi16(step1[28], step1[29]); + step2[29] = _mm_sub_epi16(step1[28], step1[29]); + step2[30] = _mm_sub_epi16(step1[31], step1[30]); + step2[31] = _mm_add_epi16(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} + +void idct32_135_8x32_ssse3(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; + idct32_135_8x32_quarter_1_2(in, temp); + idct32_135_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); +} + +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i col[2][32], io[32]; + int i; + + // rows + for (i = 0; i < 2; i++) { + load_transpose_16bit_8x8(&input[0], 32, &io[0]); + load_transpose_16bit_8x8(&input[8], 32, &io[8]); + idct32_135_8x32_ssse3(io, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + idct32_135_8x32_ssse3(io, io); + store_buffer_8x32(io, dest, stride); + dest += 8; + } +} diff --git a/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h b/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h new file mode 100644 index 0000000000..e785c8eda1 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_X86_INV_TXFM_SSSE3_H_ +#define VPX_DSP_X86_INV_TXFM_SSSE3_H_ + +#include <tmmintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) { + const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64); + const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64); + const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64); + const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i cospi_16_64d = _mm_set1_epi16((int16_t)(2 * cospi_16_64)); + const __m128i cospi_28_64d = _mm_set1_epi16((int16_t)(2 * cospi_28_64)); + const __m128i cospi_4_64d = _mm_set1_epi16((int16_t)(2 * cospi_4_64)); + const __m128i cospi_n20_64d = _mm_set1_epi16((int16_t)(-2 * cospi_20_64)); + const __m128i cospi_12_64d = _mm_set1_epi16((int16_t)(2 * cospi_12_64)); + const __m128i cospi_24_64d = _mm_set1_epi16((int16_t)(2 * cospi_24_64)); + const __m128i cospi_8_64d = _mm_set1_epi16((int16_t)(2 * cospi_8_64)); + __m128i step1[8], step2[8], tmp[4]; + + // pass 1 + + transpose_16bit_4x4(io, io); + // io[0]: 00 10 20 30 01 11 21 31 + // io[1]: 02 12 22 32 03 13 23 33 + + // stage 1 + tmp[0] = _mm_unpacklo_epi64(io[0], io[0]); + tmp[1] = _mm_unpackhi_epi64(io[0], io[0]); + tmp[2] = _mm_unpacklo_epi64(io[1], io[1]); + tmp[3] = _mm_unpackhi_epi64(io[1], io[1]); + step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d); // step1 4&7 + step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d); // step1 5&6 + + // stage 2 + step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d); // step2 0&1 + step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d); // step2 3&2 + step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 + step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 + step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]); // step2 6 + + // stage 3 + tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]); + step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]); // step1 5&6 + tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 + tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 + step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 + step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 + + // stage 4 + tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 + tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 + tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 + tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 + + // pass 2 + + idct8x8_12_transpose_16bit_4x8(tmp, io); + + // stage 1 + step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d); + step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d); + step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d); + step1[6] = _mm_mulhrs_epi16(io[3], cospi_12_64d); + + // stage 2 + step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d); // step2[1] = step2[0] + step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d); + step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d); + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[0], step2[2]); + step1[2] = _mm_sub_epi16(step2[0], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); +
butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + + // stage 4 + io[0] = _mm_add_epi16(step1[0], step2[7]); + io[1] = _mm_add_epi16(step1[1], step1[6]); + io[2] = _mm_add_epi16(step1[2], step1[5]); + io[3] = _mm_add_epi16(step1[3], step2[4]); + io[4] = _mm_sub_epi16(step1[3], step2[4]); + io[5] = _mm_sub_epi16(step1[2], step1[5]); + io[6] = _mm_sub_epi16(step1[1], step1[6]); + io[7] = _mm_sub_epi16(step1[0], step2[7]); +} + +void idct32_135_8x32_ssse3(const __m128i *const in, __m128i *const out); + +#endif // VPX_DSP_X86_INV_TXFM_SSSE3_H_ diff --git a/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm deleted file mode 100644 index 20baf820f6..0000000000 --- a/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm +++ /dev/null @@ -1,1793 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -; This file provides SSSE3 version of the inverse transformation. Part -; of the functions are originally derived from the ffmpeg project. -; Note that the current version applies to x86 64-bit only. - -SECTION_RODATA - -pw_11585x2: times 8 dw 23170 - -pw_m2404x2: times 8 dw -2404*2 -pw_m4756x2: times 8 dw -4756*2 -pw_m5520x2: times 8 dw -5520*2 -pw_m8423x2: times 8 dw -8423*2 -pw_m9102x2: times 8 dw -9102*2 -pw_m10394x2: times 8 dw -10394*2 -pw_m11003x2: times 8 dw -11003*2 - -pw_16364x2: times 8 dw 16364*2 -pw_16305x2: times 8 dw 16305*2 -pw_16207x2: times 8 dw 16207*2 -pw_16069x2: times 8 dw 16069*2 -pw_15893x2: times 8 dw 15893*2 -pw_15679x2: times 8 dw 15679*2 -pw_15426x2: times 8 dw 15426*2 -pw_15137x2: times 8 dw 15137*2 -pw_14811x2: times 8 dw 14811*2 -pw_14449x2: times 8 dw 14449*2 -pw_14053x2: times 8 dw 14053*2 -pw_13623x2: times 8 dw 13623*2 -pw_13160x2: times 8 dw 13160*2 -pw_12665x2: times 8 dw 12665*2 -pw_12140x2: times 8 dw 12140*2 -pw__9760x2: times 8 dw 9760*2 -pw__7723x2: times 8 dw 7723*2 -pw__7005x2: times 8 dw 7005*2 -pw__6270x2: times 8 dw 6270*2 -pw__3981x2: times 8 dw 3981*2 -pw__3196x2: times 8 dw 3196*2 -pw__1606x2: times 8 dw 1606*2 -pw___804x2: times 8 dw 804*2 - -pd_8192: times 4 dd 8192 -pw_32: times 8 dw 32 -pw_16: times 8 dw 16 - -%macro TRANSFORM_COEFFS 2 -pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 -pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1 -pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2 -%endmacro - -TRANSFORM_COEFFS 6270, 15137 -TRANSFORM_COEFFS 3196, 16069 -TRANSFORM_COEFFS 13623, 9102 - -; constants for 32x32_34 -TRANSFORM_COEFFS 804, 16364 -TRANSFORM_COEFFS 15426, 5520 -TRANSFORM_COEFFS 3981, 15893 -TRANSFORM_COEFFS 16207, 2404 -TRANSFORM_COEFFS 1606, 16305 -TRANSFORM_COEFFS 15679, 4756 -TRANSFORM_COEFFS 11585, 11585 - -; constants for 32x32_1024 -TRANSFORM_COEFFS 12140, 11003 -TRANSFORM_COEFFS 7005, 14811 -TRANSFORM_COEFFS 14053, 8423 -TRANSFORM_COEFFS 9760, 13160 -TRANSFORM_COEFFS 12665, 10394 -TRANSFORM_COEFFS 7723, 14449 - -%macro PAIR_PP_COEFFS 2 -dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2 -%endmacro - -%macro PAIR_MP_COEFFS 2 -dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2 -%endmacro - -%macro PAIR_MM_COEFFS 2 -dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2 
-%endmacro - -PAIR_PP_COEFFS 30274, 12540 -PAIR_PP_COEFFS 6392, 32138 -PAIR_MP_COEFFS 18204, 27246 - -PAIR_PP_COEFFS 12540, 12540 -PAIR_PP_COEFFS 30274, 30274 -PAIR_PP_COEFFS 6392, 6392 -PAIR_PP_COEFFS 32138, 32138 -PAIR_MM_COEFFS 18204, 18204 -PAIR_PP_COEFFS 27246, 27246 - -SECTION .text - -%if ARCH_X86_64 -%macro SUM_SUB 3 - psubw m%3, m%1, m%2 - paddw m%1, m%2 - SWAP %2, %3 -%endmacro - -; butterfly operation -%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 - pmaddwd m%1, m%3, %5 - pmaddwd m%2, m%3, %6 - paddd m%1, %4 - paddd m%2, %4 - psrad m%1, 14 - psrad m%2, 14 -%endmacro - -%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_m%3_m%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_m%3_m%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro - -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 - - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 - - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 - - SWAP %2, %5 - SWAP %4, %7 -%endmacro - -%macro IDCT8_1D 0 - SUM_SUB 0, 4, 9 - BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10 - pmulhrsw m0, m12 - pmulhrsw m4, m12 - BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10 - BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10 - - SUM_SUB 1, 5, 9 - SUM_SUB 7, 3, 9 - SUM_SUB 0, 6, 9 - SUM_SUB 4, 2, 9 - SUM_SUB 3, 5, 9 - pmulhrsw m3, m12 - pmulhrsw m5, m12 - - SUM_SUB 0, 7, 9 - SUM_SUB 4, 3, 9 - SUM_SUB 2, 5, 9 - SUM_SUB 6, 1, 9 - - SWAP 3, 6 - SWAP 1, 4 -%endmacro - -; This macro handles 8 pixels per line -%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero - paddw m%1, m11 - paddw m%2, m11 - psraw m%1, 5 - psraw m%2, 5 - - movh m%3, [outputq] - movh m%4, [outputq + strideq] - punpcklbw m%3, m%5 - punpcklbw m%4, m%5 - paddw m%3, m%1 - paddw m%4, m%2 - packuswb m%3, m%5 - packuswb m%4, m%5 - movh [outputq], m%3 - movh [outputq + strideq], m%4 -%endmacro - -INIT_XMM ssse3 -; full inverse 8x8 2D-DCT transform -cglobal idct8x8_64_add, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m11, [pw_16] - mova m12, [pw_11585x2] - - lea r3, [2 * strideq] -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] - mova m2, [inputq + 64] - packssdw m2, [inputq + 80] - mova m3, [inputq + 96] - packssdw m3, [inputq + 112] - mova m4, [inputq + 128] - packssdw m4, [inputq + 144] - mova m5, [inputq + 160] - packssdw m5, [inputq + 176] - mova m6, [inputq + 192] - packssdw m6, [inputq + 208] - mova m7, [inputq + 224] - packssdw m7, [inputq + 240] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] - mova m2, [inputq + 32] - mova m3, [inputq + 48] - mova m4, [inputq + 64] - mova m5, [inputq + 80] - mova m6, [inputq + 96] - mova m7, [inputq + 112] -%endif - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - IDCT8_1D - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - 
IDCT8_1D - - pxor m12, m12 - ADD_STORE_8P_2X 0, 1, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 2, 3, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 4, 5, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 6, 7, 9, 10, 12 - - RET - -; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero -cglobal idct8x8_12_add, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m11, [pw_16] - mova m12, [pw_11585x2] - - lea r3, [2 * strideq] - -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] - mova m2, [inputq + 64] - packssdw m2, [inputq + 80] - mova m3, [inputq + 96] - packssdw m3, [inputq + 112] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] - mova m2, [inputq + 32] - mova m3, [inputq + 48] -%endif - - punpcklwd m0, m1 - punpcklwd m2, m3 - punpckhdq m9, m0, m2 - punpckldq m0, m2 - SWAP 2, 9 - - ; m0 -> [0], [0] - ; m1 -> [1], [1] - ; m2 -> [2], [2] - ; m3 -> [3], [3] - punpckhqdq m10, m0, m0 - punpcklqdq m0, m0 - punpckhqdq m9, m2, m2 - punpcklqdq m2, m2 - SWAP 1, 10 - SWAP 3, 9 - - pmulhrsw m0, m12 - pmulhrsw m2, [dpw_30274_12540] - pmulhrsw m1, [dpw_6392_32138] - pmulhrsw m3, [dpw_m18204_27246] - - SUM_SUB 0, 2, 9 - SUM_SUB 1, 3, 9 - - punpcklqdq m9, m3, m3 - punpckhqdq m5, m3, m9 - - SUM_SUB 3, 5, 9 - punpckhqdq m5, m3 - pmulhrsw m5, m12 - - punpckhqdq m9, m1, m5 - punpcklqdq m1, m5 - SWAP 5, 9 - - SUM_SUB 0, 5, 9 - SUM_SUB 2, 1, 9 - - punpckhqdq m3, m0, m0 - punpckhqdq m4, m1, m1 - punpckhqdq m6, m5, m5 - punpckhqdq m7, m2, m2 - - punpcklwd m0, m3 - punpcklwd m7, m2 - punpcklwd m1, m4 - punpcklwd m6, m5 - - punpckhdq m4, m0, m7 - punpckldq m0, m7 - punpckhdq m10, m1, m6 - punpckldq m5, m1, m6 - - punpckhqdq m1, m0, m5 - punpcklqdq m0, m5 - punpckhqdq m3, m4, m10 - punpcklqdq m2, m4, m10 - - - pmulhrsw m0, m12 - pmulhrsw m6, m2, [dpw_30274_30274] - pmulhrsw m4, m2, [dpw_12540_12540] - - pmulhrsw m7, m1, [dpw_32138_32138] - pmulhrsw m1, [dpw_6392_6392] - pmulhrsw m5, m3, [dpw_m18204_m18204] - pmulhrsw m3, [dpw_27246_27246] - - mova m2, m0 - SUM_SUB 0, 6, 9 - SUM_SUB 2, 4, 9 - SUM_SUB 1, 5, 9 - SUM_SUB 7, 3, 9 - - SUM_SUB 3, 5, 9 - pmulhrsw m3, m12 - pmulhrsw m5, m12 - - SUM_SUB 0, 7, 9 - SUM_SUB 2, 3, 9 - SUM_SUB 4, 5, 9 - SUM_SUB 6, 1, 9 - - SWAP 3, 6 - SWAP 1, 2 - SWAP 2, 4 - - - pxor m12, m12 - ADD_STORE_8P_2X 0, 1, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 2, 3, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 4, 5, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 6, 7, 9, 10, 12 - - RET - -%define idx0 16 * 0 -%define idx1 16 * 1 -%define idx2 16 * 2 -%define idx3 16 * 3 -%define idx4 16 * 4 -%define idx5 16 * 5 -%define idx6 16 * 6 -%define idx7 16 * 7 -%define idx8 16 * 0 -%define idx9 16 * 1 -%define idx10 16 * 2 -%define idx11 16 * 3 -%define idx12 16 * 4 -%define idx13 16 * 5 -%define idx14 16 * 6 -%define idx15 16 * 7 -%define idx16 16 * 0 -%define idx17 16 * 1 -%define idx18 16 * 2 -%define idx19 16 * 3 -%define idx20 16 * 4 -%define idx21 16 * 5 -%define idx22 16 * 6 -%define idx23 16 * 7 -%define idx24 16 * 0 -%define idx25 16 * 1 -%define idx26 16 * 2 -%define idx27 16 * 3 -%define idx28 16 * 4 -%define idx29 16 * 5 -%define idx30 16 * 6 -%define idx31 16 * 7 - -; FROM idct32x32_add_neon.asm -; -; Instead of doing the transforms stage by stage, it is done by loading -; some input values and doing as many stages as possible to minimize the -; storing/loading of intermediate results. 
To fit within registers, the -; final coefficients are cut into four blocks: -; BLOCK A: 16-19,28-31 -; BLOCK B: 20-23,24-27 -; BLOCK C: 8-11,12-15 -; BLOCK D: 0-3,4-7 -; Blocks A and C are straight calculation through the various stages. In -; block B, further calculations are performed using the results from -; block A. In block D, further calculations are performed using the results -; from block C and then the final calculations are done using results from -; block A and B which have been combined at the end of block B. -; - -%macro IDCT32X32_34 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, m1 - pmulhrsw m1, [pw___804x2] ; stp1_16 - mova [r4 + 0], m0 - pmulhrsw m11, [pw_16364x2] ; stp2_31 - mova [r4 + 16 * 2], m2 - mova m12, m7 - pmulhrsw m7, [pw_15426x2] ; stp1_28 - mova [r4 + 16 * 4], m4 - pmulhrsw m12, [pw_m5520x2] ; stp2_19 - mova [r4 + 16 * 6], m6 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m2, m1 ; stp1_16 - mova m0, m11 ; stp1_31 - mova m4, m7 ; stp1_28 - mova m15, m12 ; stp1_19 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 15, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - SUM_SUB 0, 15, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 15, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m6, m5 - pmulhrsw m5, [pw__3981x2] ; stp1_20 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m15 - pmulhrsw m6, [pw_15893x2] ; stp2_27 - mova [stp + %4 + idx30], m2 - mova m2, m3 - pmulhrsw m3, [pw_m2404x2] ; stp1_23 - mova [stp + %4 + idx31], m11 - pmulhrsw m2, [pw_16207x2] ; stp2_24 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m13, m5 ; stp1_20 - mova m14, m6 ; stp1_27 - mova m15, m3 ; stp1_23 - mova m11, m2 ; stp1_24 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 11, 15, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 15, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 11, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 0, 15, 9 ; stp2_17, stp2_22 - SUM_SUB 4, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 7, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m10, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 10, 11, 9 ; 
stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m10 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 11, 15, 9 - pmulhrsw m11, m10 ; stp1_25 - pmulhrsw m15, m10 ; stp1_22 - SUM_SUB 2, 3, 9 - pmulhrsw m2, m10 ; stp1_24 - pmulhrsw m3, m10 ; stp1_23 -%else - BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 - SWAP 6, 5 - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 - SWAP 13, 14 - BUTTERFLY_4X 11, 15, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 - SWAP 11, 15 - BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 - SWAP 2, 3 -%endif - - mova [stp + %4 + idx24], m2 - mova [stp + %4 + idx25], m11 - mova [stp + %4 + idx26], m13 - mova [stp + %4 + idx27], m6 - - ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 2] - mova m6, [rsp + transposed_in + 16 * 6] - - mova m1, m0 - pmulhrsw m0, [pw__1606x2] ; stp1_8 - mova [stp + %3 + idx20], m5 - mova [stp + %3 + idx21], m14 - pmulhrsw m1, [pw_16305x2] ; stp2_15 - mova [stp + %3 + idx22], m15 - mova m7, m6 - pmulhrsw m7, [pw_m4756x2] ; stp2_11 - mova [stp + %3 + idx23], m3 - pmulhrsw m6, [pw_15679x2] ; stp1_12 - - ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m3, m0 ; stp1_8 - mova m2, m1 ; stp1_15 - - ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 - mova m4, m7 ; stp1_11 - mova m5, m6 ; stp1_12 - BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 - - ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 - SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 - SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 - SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 - - ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 - pmulhrsw m5, m10 ; stp1_13 - pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 - pmulhrsw m6, m10 ; stp1_12 - pmulhrsw m7, m10 ; stp1_11 -%else - BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 - SWAP 5, 4 - BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 - SWAP 6, 7 -%endif - - ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova [stp + %2 + idx8], m0 - mova [stp + %2 + idx9], m2 - mova [stp + %2 + idx10], m4 - mova [stp + %2 + idx11], m7 - - ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, [rsp + transposed_in + 16 * 4] - mova m12, m11 - pmulhrsw m11, [pw__3196x2] ; stp1_4 - pmulhrsw m12, [pw_16069x2] ; stp1_7 - - ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 0] - mova m10, [pw_11585x2] - pmulhrsw m0, m10 ; stp1_1 - - mova m14, m11 ; stp1_4 - 
mova m13, m12 ; stp1_7 - - ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_6 - pmulhrsw m14, m10 ; stp1_5 -%else - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 - SWAP 13, 14 -%endif - mova m7, m0 ; stp1_0 = stp1_1 - mova m4, m0 ; stp1_1 - mova m2, m7 ; stp1_0 - - ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 - SUM_SUB 7, 13, 9 ; stp1_1, stp1_6 - SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 - SUM_SUB 4, 11, 9 ; stp1_3, stp1_4 - - ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 1, 9 ; stp1_0, stp1_15 - SUM_SUB 7, 3, 9 ; stp1_1, stp1_14 - SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 - SUM_SUB 4, 6, 9 ; stp1_3, stp1_12 - - ; 0-3, 28-31 final stage - mova m15, [stp + %4 + idx30] - mova m10, [stp + %4 + idx31] - SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 - SUM_SUB 7, 15, 9 ; stp1_1, stp1_30 - mova [stp + %1 + idx0], m0 - mova [stp + %1 + idx1], m7 - mova [stp + %4 + idx30], m15 - mova [stp + %4 + idx31], m10 - mova m7, [stp + %4 + idx28] - mova m0, [stp + %4 + idx29] - SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 - SUM_SUB 4, 7, 9 ; stp1_3, stp1_28 - mova [stp + %1 + idx2], m2 - mova [stp + %1 + idx3], m4 - mova [stp + %4 + idx28], m7 - mova [stp + %4 + idx29], m0 - - ; 12-15, 16-19 final stage - mova m0, [stp + %3 + idx16] - mova m7, [stp + %3 + idx17] - mova m2, [stp + %3 + idx18] - mova m4, [stp + %3 + idx19] - SUM_SUB 1, 0, 9 ; stp1_15, stp1_16 - SUM_SUB 3, 7, 9 ; stp1_14, stp1_17 - SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 - SUM_SUB 6, 4, 9 ; stp1_12, stp1_19 - mova [stp + %2 + idx12], m6 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m3 - mova [stp + %2 + idx15], m1 - mova [stp + %3 + idx16], m0 - mova [stp + %3 + idx17], m7 - mova [stp + %3 + idx18], m2 - mova [stp + %3 + idx19], m4 - - mova m4, [stp + %2 + idx8] - mova m5, [stp + %2 + idx9] - mova m6, [stp + %2 + idx10] - mova m7, [stp + %2 + idx11] - SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 - SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 - SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 - SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 - - ; 4-7, 24-27 final stage - mova m0, [stp + %4 + idx27] - mova m1, [stp + %4 + idx26] - mova m2, [stp + %4 + idx25] - mova m3, [stp + %4 + idx24] - SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 - SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 - SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 - SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 - mova [stp + %4 + idx27], m0 - mova [stp + %4 + idx26], m1 - mova [stp + %4 + idx25], m2 - mova [stp + %4 + idx24], m3 - mova [stp + %1 + idx4], m11 - mova [stp + %1 + idx5], m14 - mova [stp + %1 + idx6], m13 - mova [stp + %1 + idx7], m12 - - ; 8-11, 20-23 final stage - mova m0, [stp + %3 + idx20] - mova m1, [stp + %3 + idx21] - mova m2, [stp + %3 + idx22] - mova m3, [stp + %3 + idx23] - SUM_SUB 7, 0, 9 ; stp1_11, stp_20 - SUM_SUB 6, 1, 9 ; stp1_10, stp_21 - SUM_SUB 5, 2, 9 ; stp1_9, stp_22 - SUM_SUB 4, 3, 9 ; stp1_8, stp_23 - mova [stp + %2 + idx8], m4 - mova [stp + %2 + idx9], m5 - mova [stp + %2 + idx10], m6 - mova [stp + %2 + idx11], m7 - mova [stp + %3 + idx20], m0 - mova [stp + %3 + idx21], m1 - mova [stp + %3 + idx22], m2 - mova [stp + %3 + idx23], m3 -%endmacro - -%macro RECON_AND_STORE 1 - mova m11, [pw_32] - lea stp, [rsp + %1] - mov r6, 32 - pxor m8, m8 -%%recon_and_store: - mova m0, [stp + 16 * 32 * 0] - mova m1, [stp + 16 * 32 * 1] - mova m2, [stp + 16 * 32 * 2] - mova m3, [stp + 16 * 32 * 3] - add stp, 16 - - paddw m0, 
m11 - paddw m1, m11 - paddw m2, m11 - paddw m3, m11 - psraw m0, 6 - psraw m1, 6 - psraw m2, 6 - psraw m3, 6 - movh m4, [outputq + 0] - movh m5, [outputq + 8] - movh m6, [outputq + 16] - movh m7, [outputq + 24] - punpcklbw m4, m8 - punpcklbw m5, m8 - punpcklbw m6, m8 - punpcklbw m7, m8 - paddw m0, m4 - paddw m1, m5 - paddw m2, m6 - paddw m3, m7 - packuswb m0, m1 - packuswb m2, m3 - mova [outputq + 0], m0 - mova [outputq + 16], m2 - lea outputq, [outputq + strideq] - dec r6 - jnz %%recon_and_store -%endmacro - -%define i32x32_size 16*32*5 -%define pass_two_start 16*32*0 -%define transposed_in 16*32*4 -%define pass_one_start 16*32*0 -%define stp r8 - -INIT_XMM ssse3 -cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride - mova m8, [pd_8192] - lea stp, [rsp + pass_one_start] - -idct32x32_34: - mov r3, inputq - lea r4, [rsp + transposed_in] - -idct32x32_34_transpose: -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [r3 + 0] - packssdw m0, [r3 + 16] - mova m1, [r3 + 32 * 4] - packssdw m1, [r3 + 32 * 4 + 16] - mova m2, [r3 + 32 * 8] - packssdw m2, [r3 + 32 * 8 + 16] - mova m3, [r3 + 32 * 12] - packssdw m3, [r3 + 32 * 12 + 16] - mova m4, [r3 + 32 * 16] - packssdw m4, [r3 + 32 * 16 + 16] - mova m5, [r3 + 32 * 20] - packssdw m5, [r3 + 32 * 20 + 16] - mova m6, [r3 + 32 * 24] - packssdw m6, [r3 + 32 * 24 + 16] - mova m7, [r3 + 32 * 28] - packssdw m7, [r3 + 32 * 28 + 16] -%else - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 4] - mova m2, [r3 + 16 * 8] - mova m3, [r3 + 16 * 12] - mova m4, [r3 + 16 * 16] - mova m5, [r3 + 16 * 20] - mova m6, [r3 + 16 * 24] - mova m7, [r3 + 16 * 28] -%endif - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - IDCT32X32_34 16*0, 16*32, 16*64, 16*96 - lea stp, [stp + 16 * 8] - mov r6, 4 - lea stp, [rsp + pass_one_start] - lea r9, [rsp + pass_one_start] - -idct32x32_34_2: - lea r4, [rsp + transposed_in] - mov r3, r9 - -idct32x32_34_transpose_2: - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 1] - mova m2, [r3 + 16 * 2] - mova m3, [r3 + 16 * 3] - mova m4, [r3 + 16 * 4] - mova m5, [r3 + 16 * 5] - mova m6, [r3 + 16 * 6] - mova m7, [r3 + 16 * 7] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - IDCT32X32_34 16*0, 16*8, 16*16, 16*24 - - lea stp, [stp + 16 * 32] - add r9, 16 * 32 - dec r6 - jnz idct32x32_34_2 - - RECON_AND_STORE pass_two_start - - RET - -%macro IDCT32X32_135 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m1, [rsp + transposed_in + 16 * 1] - mova m11, m1 - pmulhrsw m1, [pw___804x2] ; stp1_16 - pmulhrsw m11, [pw_16364x2] ; stp2_31 - - mova m7, [rsp + transposed_in + 16 * 7] - mova m12, m7 - pmulhrsw m7, [pw_15426x2] ; stp1_28 - pmulhrsw m12, [pw_m5520x2] ; stp2_19 - - mova m3, [rsp + transposed_in + 16 * 9] - mova m4, m3 - pmulhrsw m3, [pw__7005x2] ; stp1_18 - pmulhrsw m4, [pw_14811x2] ; stp2_29 - - mova m0, [rsp + transposed_in + 16 * 15] - mova m2, m0 - pmulhrsw m0, [pw_12140x2] ; stp1_30 - pmulhrsw m2, [pw_m11003x2] ; stp2_17 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 2, 9 ; stp2_16, stp2_17 - SUM_SUB 12, 3, 9 ; stp2_19, stp2_18 - SUM_SUB 7, 4, 9 ; stp2_28, stp2_29 - SUM_SUB 11, 0, 9 ; stp2_31, stp2_30 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - SUM_SUB 0, 3, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; 
stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m3 - mova [stp + %4 + idx30], m2 - mova [stp + %4 + idx31], m11 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m2, [rsp + transposed_in + 16 * 3] - mova m3, m2 - pmulhrsw m3, [pw_m2404x2] ; stp1_23 - pmulhrsw m2, [pw_16207x2] ; stp2_24 - - mova m5, [rsp + transposed_in + 16 * 5] - mova m6, m5 - pmulhrsw m5, [pw__3981x2] ; stp1_20 - pmulhrsw m6, [pw_15893x2] ; stp2_27 - - mova m14, [rsp + transposed_in + 16 * 11] - mova m13, m14 - pmulhrsw m13, [pw_m8423x2] ; stp1_21 - pmulhrsw m14, [pw_14053x2] ; stp2_26 - - mova m0, [rsp + transposed_in + 16 * 13] - mova m1, m0 - pmulhrsw m0, [pw__9760x2] ; stp1_22 - pmulhrsw m1, [pw_13160x2] ; stp2_25 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 5, 13, 9 ; stp2_20, stp2_21 - SUM_SUB 3, 0, 9 ; stp2_23, stp2_22 - SUM_SUB 2, 1, 9 ; stp2_24, stp2_25 - SUM_SUB 6, 14, 9 ; stp2_27, stp2_26 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 0, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 1, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %3 + idx16] - mova m7, [stp + %3 + idx17] - mova m11, [stp + %3 + idx18] - mova m12, [stp + %3 + idx19] - SUM_SUB 4, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 7, 0, 9 ; stp2_17, stp2_22 - SUM_SUB 11, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 12, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m4 - mova [stp + %3 + idx17], m7 - mova [stp + %3 + idx18], m11 - mova [stp + %3 + idx19], m12 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m11, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 11, 1, 9 ; stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m11 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 1, 0, 9 - pmulhrsw m1, m10 ; stp1_25 - pmulhrsw m0, m10 ; stp1_22 - SUM_SUB 2, 3, 9 - pmulhrsw m2, m10 ; stp1_25 - pmulhrsw m3, m10 ; stp1_22 -%else - BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 - SWAP 6, 5 - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 - SWAP 13, 14 - BUTTERFLY_4X 1, 0, 
11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 - SWAP 1, 0 - BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 - SWAP 2, 3 -%endif - mova [stp + %3 + idx20], m5 - mova [stp + %3 + idx21], m14 - mova [stp + %3 + idx22], m0 - mova [stp + %3 + idx23], m3 - mova [stp + %4 + idx24], m2 - mova [stp + %4 + idx25], m1 - mova [stp + %4 + idx26], m13 - mova [stp + %4 + idx27], m6 - - ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 2] - mova m1, m0 - pmulhrsw m0, [pw__1606x2] ; stp1_8 - pmulhrsw m1, [pw_16305x2] ; stp2_15 - - mova m6, [rsp + transposed_in + 16 * 6] - mova m7, m6 - pmulhrsw m7, [pw_m4756x2] ; stp2_11 - pmulhrsw m6, [pw_15679x2] ; stp1_12 - - mova m4, [rsp + transposed_in + 16 * 10] - mova m5, m4 - pmulhrsw m4, [pw__7723x2] ; stp1_10 - pmulhrsw m5, [pw_14449x2] ; stp2_13 - - mova m2, [rsp + transposed_in + 16 * 14] - mova m3, m2 - pmulhrsw m3, [pw_m10394x2] ; stp1_9 - pmulhrsw m2, [pw_12665x2] ; stp2_14 - - ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 3, 9 ; stp1_8, stp1_9 - SUM_SUB 7, 4, 9 ; stp1_11, stp1_10 - SUM_SUB 6, 5, 9 ; stp1_12, stp1_13 - SUM_SUB 1, 2, 9 ; stp1_15, stp1_14 - - ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 - BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 - - ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 - SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 - SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 - SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 - - ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 - pmulhrsw m5, m10 ; stp1_13 - pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 - pmulhrsw m6, m10 ; stp1_12 - pmulhrsw m7, m10 ; stp1_11 -%else - BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 - SWAP 5, 4 - BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 - SWAP 6, 7 -%endif - ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova [stp + %2 + idx8], m0 - mova [stp + %2 + idx9], m2 - mova [stp + %2 + idx10], m4 - mova [stp + %2 + idx11], m7 - mova [stp + %2 + idx12], m6 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m3 - mova [stp + %2 + idx15], m1 - - ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, [rsp + transposed_in + 16 * 4] - mova m12, m11 - pmulhrsw m11, [pw__3196x2] ; stp1_4 - pmulhrsw m12, [pw_16069x2] ; stp1_7 - - mova m13, [rsp + transposed_in + 16 * 12] - mova m14, m13 - pmulhrsw m13, [pw_13623x2] ; stp1_6 - pmulhrsw m14, [pw_m9102x2] ; stp1_5 - - ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 0] - mova m2, [rsp + transposed_in + 16 * 8] - pmulhrsw m0, [pw_11585x2] ; stp1_1 - mova m3, m2 - pmulhrsw m2, [pw__6270x2] ; stp1_2 - pmulhrsw m3, [pw_15137x2] ; stp1_3 - - SUM_SUB 11, 14, 9 ; stp1_4, stp1_5 - SUM_SUB 12, 13, 9 ; stp1_7, stp1_6 - - ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in 
SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_6 - pmulhrsw m14, m10 ; stp1_5 -%else - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 - SWAP 13, 14 -%endif - mova m1, m0 ; stp1_0 = stp1_1 - SUM_SUB 0, 3, 9 ; stp1_0, stp1_3 - SUM_SUB 1, 2, 9 ; stp1_1, stp1_2 - - ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 - SUM_SUB 1, 13, 9 ; stp1_1, stp1_6 - SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 - SUM_SUB 3, 11, 9 ; stp1_3, stp1_4 - - ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %2 + idx12] - mova m5, [stp + %2 + idx13] - mova m6, [stp + %2 + idx14] - mova m7, [stp + %2 + idx15] - SUM_SUB 0, 7, 9 ; stp1_0, stp1_15 - SUM_SUB 1, 6, 9 ; stp1_1, stp1_14 - SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 - SUM_SUB 3, 4, 9 ; stp1_3, stp1_12 - - ; 0-3, 28-31 final stage - mova m10, [stp + %4 + idx31] - mova m15, [stp + %4 + idx30] - SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 - SUM_SUB 1, 15, 9 ; stp1_1, stp1_30 - mova [stp + %1 + idx0], m0 - mova [stp + %1 + idx1], m1 - mova [stp + %4 + idx31], m10 - mova [stp + %4 + idx30], m15 - mova m0, [stp + %4 + idx29] - mova m1, [stp + %4 + idx28] - SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 - SUM_SUB 3, 1, 9 ; stp1_3, stp1_28 - mova [stp + %1 + idx2], m2 - mova [stp + %1 + idx3], m3 - mova [stp + %4 + idx29], m0 - mova [stp + %4 + idx28], m1 - - ; 12-15, 16-19 final stage - mova m0, [stp + %3 + idx16] - mova m1, [stp + %3 + idx17] - mova m2, [stp + %3 + idx18] - mova m3, [stp + %3 + idx19] - SUM_SUB 7, 0, 9 ; stp1_15, stp1_16 - SUM_SUB 6, 1, 9 ; stp1_14, stp1_17 - SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 - SUM_SUB 4, 3, 9 ; stp1_12, stp1_19 - mova [stp + %2 + idx12], m4 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m6 - mova [stp + %2 + idx15], m7 - mova [stp + %3 + idx16], m0 - mova [stp + %3 + idx17], m1 - mova [stp + %3 + idx18], m2 - mova [stp + %3 + idx19], m3 - - mova m4, [stp + %2 + idx8] - mova m5, [stp + %2 + idx9] - mova m6, [stp + %2 + idx10] - mova m7, [stp + %2 + idx11] - SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 - SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 - SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 - SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 - - ; 4-7, 24-27 final stage - mova m3, [stp + %4 + idx24] - mova m2, [stp + %4 + idx25] - mova m1, [stp + %4 + idx26] - mova m0, [stp + %4 + idx27] - SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 - SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 - SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 - SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 - mova [stp + %4 + idx24], m3 - mova [stp + %4 + idx25], m2 - mova [stp + %4 + idx26], m1 - mova [stp + %4 + idx27], m0 - mova [stp + %1 + idx4], m11 - mova [stp + %1 + idx5], m14 - mova [stp + %1 + idx6], m13 - mova [stp + %1 + idx7], m12 - - ; 8-11, 20-23 final stage - mova m0, [stp + %3 + idx20] - mova m1, [stp + %3 + idx21] - mova m2, [stp + %3 + idx22] - mova m3, [stp + %3 + idx23] - SUM_SUB 7, 0, 9 ; stp1_11, stp_20 - SUM_SUB 6, 1, 9 ; stp1_10, stp_21 - SUM_SUB 5, 2, 9 ; stp1_9, stp_22 - SUM_SUB 4, 3, 9 ; stp1_8, stp_23 - mova [stp + %2 + idx8], m4 - mova [stp + %2 + idx9], m5 - mova [stp + %2 + idx10], m6 - mova [stp + %2 + idx11], m7 - mova [stp + %3 + idx20], m0 - mova [stp + %3 + idx21], m1 - mova [stp + %3 + idx22], m2 - mova [stp + %3 + idx23], m3 -%endmacro - -INIT_XMM ssse3 -cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride - mova m8, [pd_8192] - mov r6, 2 - lea stp, [rsp + pass_one_start] - -idct32x32_135: - mov r3, inputq - lea r4, [rsp + transposed_in] - mov 
r7, 2 - -idct32x32_135_transpose: -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [r3 + 0] - packssdw m0, [r3 + 16] - mova m1, [r3 + 32 * 4] - packssdw m1, [r3 + 32 * 4 + 16] - mova m2, [r3 + 32 * 8] - packssdw m2, [r3 + 32 * 8 + 16] - mova m3, [r3 + 32 * 12] - packssdw m3, [r3 + 32 * 12 + 16] - mova m4, [r3 + 32 * 16] - packssdw m4, [r3 + 32 * 16 + 16] - mova m5, [r3 + 32 * 20] - packssdw m5, [r3 + 32 * 20 + 16] - mova m6, [r3 + 32 * 24] - packssdw m6, [r3 + 32 * 24 + 16] - mova m7, [r3 + 32 * 28] - packssdw m7, [r3 + 32 * 28 + 16] -%else - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 4] - mova m2, [r3 + 16 * 8] - mova m3, [r3 + 16 * 12] - mova m4, [r3 + 16 * 16] - mova m5, [r3 + 16 * 20] - mova m6, [r3 + 16 * 24] - mova m7, [r3 + 16 * 28] -%endif - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 - -%if CONFIG_VP9_HIGHBITDEPTH - add r3, 32 -%else - add r3, 16 -%endif - add r4, 16 * 8 - dec r7 - jne idct32x32_135_transpose - - IDCT32X32_135 16*0, 16*32, 16*64, 16*96 - lea stp, [stp + 16 * 8] -%if CONFIG_VP9_HIGHBITDEPTH - lea inputq, [inputq + 32 * 32] -%else - lea inputq, [inputq + 16 * 32] -%endif - dec r6 - jnz idct32x32_135 - - mov r6, 4 - lea stp, [rsp + pass_one_start] - lea r9, [rsp + pass_one_start] - -idct32x32_135_2: - lea r4, [rsp + transposed_in] - mov r3, r9 - mov r7, 2 - -idct32x32_135_transpose_2: - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 1] - mova m2, [r3 + 16 * 2] - mova m3, [r3 + 16 * 3] - mova m4, [r3 + 16 * 4] - mova m5, [r3 + 16 * 5] - mova m6, [r3 + 16 * 6] - mova m7, [r3 + 16 * 7] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 - - add r3, 16 * 8 - add r4, 16 * 8 - dec r7 - jne idct32x32_135_transpose_2 - - IDCT32X32_135 16*0, 16*8, 16*16, 16*24 - - lea stp, [stp + 16 * 32] - add r9, 16 * 32 - dec r6 - jnz idct32x32_135_2 - - RECON_AND_STORE pass_two_start - - RET - -%macro IDCT32X32_1024 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m1, [rsp + transposed_in + 16 * 1] - mova m11, [rsp + transposed_in + 16 * 31] - BUTTERFLY_4X 1, 11, 804, 16364, m8, 9, 10 ; stp1_16, stp1_31 - - mova m0, [rsp + transposed_in + 16 * 15] - mova m2, [rsp + transposed_in + 16 * 17] - BUTTERFLY_4X 2, 0, 12140, 11003, m8, 9, 10 ; stp1_17, stp1_30 - - mova m7, [rsp + transposed_in + 16 * 7] - mova m12, [rsp + transposed_in + 16 * 25] - BUTTERFLY_4X 12, 7, 15426, 5520, m8, 9, 10 ; stp1_19, stp1_28 - - mova m3, [rsp + transposed_in + 16 * 9] - mova m4, [rsp + transposed_in + 16 * 23] - BUTTERFLY_4X 3, 4, 7005, 14811, m8, 9, 10 ; stp1_18, stp1_29 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 2, 9 ; stp2_16, stp2_17 - SUM_SUB 12, 3, 9 ; stp2_19, stp2_18 - SUM_SUB 7, 4, 9 ; stp2_28, stp2_29 - SUM_SUB 11, 0, 9 ; stp2_31, stp2_30 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - SUM_SUB 0, 3, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; 
stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m3 - mova [stp + %4 + idx30], m2 - mova [stp + %4 + idx31], m11 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m5, [rsp + transposed_in + 16 * 5] - mova m6, [rsp + transposed_in + 16 * 27] - BUTTERFLY_4X 5, 6, 3981, 15893, m8, 9, 10 ; stp1_20, stp1_27 - - mova m13, [rsp + transposed_in + 16 * 21] - mova m14, [rsp + transposed_in + 16 * 11] - BUTTERFLY_4X 13, 14, 14053, 8423, m8, 9, 10 ; stp1_21, stp1_26 - - mova m0, [rsp + transposed_in + 16 * 13] - mova m1, [rsp + transposed_in + 16 * 19] - BUTTERFLY_4X 0, 1, 9760, 13160, m8, 9, 10 ; stp1_22, stp1_25 - - mova m2, [rsp + transposed_in + 16 * 3] - mova m3, [rsp + transposed_in + 16 * 29] - BUTTERFLY_4X 3, 2, 16207, 2404, m8, 9, 10 ; stp1_23, stp1_24 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 5, 13, 9 ; stp2_20, stp2_21 - SUM_SUB 3, 0, 9 ; stp2_23, stp2_22 - SUM_SUB 2, 1, 9 ; stp2_24, stp2_25 - SUM_SUB 6, 14, 9 ; stp2_27, stp2_26 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 0, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 1, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %3 + idx16] - mova m7, [stp + %3 + idx17] - mova m11, [stp + %3 + idx18] - mova m12, [stp + %3 + idx19] - SUM_SUB 4, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 7, 0, 9 ; stp2_17, stp2_22 - SUM_SUB 11, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 12, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m4 - mova [stp + %3 + idx17], m7 - mova [stp + %3 + idx18], m11 - mova [stp + %3 + idx19], m12 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m11, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 11, 1, 9 ; stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m11 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 1, 0, 9 - pmulhrsw m1, m10 ; stp1_25 - pmulhrsw m0, m10 ; stp1_22 - SUM_SUB 2, 3, 9 - pmulhrsw m2, m10 ; stp1_25 - pmulhrsw m3, m10 ; stp1_22 -%else - BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 - SWAP 6, 5 - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 - SWAP 13, 
14 - BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 - SWAP 1, 0 - BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 - SWAP 2, 3 -%endif - mova [stp + %3 + idx20], m5 - mova [stp + %3 + idx21], m14 - mova [stp + %3 + idx22], m0 - mova [stp + %3 + idx23], m3 - mova [stp + %4 + idx24], m2 - mova [stp + %4 + idx25], m1 - mova [stp + %4 + idx26], m13 - mova [stp + %4 + idx27], m6 - - ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 2] - mova m1, [rsp + transposed_in + 16 * 30] - BUTTERFLY_4X 0, 1, 1606, 16305, m8, 9, 10 ; stp1_8, stp1_15 - - mova m2, [rsp + transposed_in + 16 * 14] - mova m3, [rsp + transposed_in + 16 * 18] - BUTTERFLY_4X 3, 2, 12665, 10394, m8, 9, 10 ; stp1_9, stp1_14 - - mova m4, [rsp + transposed_in + 16 * 10] - mova m5, [rsp + transposed_in + 16 * 22] - BUTTERFLY_4X 4, 5, 7723, 14449, m8, 9, 10 ; stp1_10, stp1_13 - - mova m6, [rsp + transposed_in + 16 * 6] - mova m7, [rsp + transposed_in + 16 * 26] - BUTTERFLY_4X 7, 6, 15679, 4756, m8, 9, 10 ; stp1_11, stp1_12 - - ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 3, 9 ; stp1_8, stp1_9 - SUM_SUB 7, 4, 9 ; stp1_11, stp1_10 - SUM_SUB 6, 5, 9 ; stp1_12, stp1_13 - SUM_SUB 1, 2, 9 ; stp1_15, stp1_14 - - ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 - BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 - - ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 - SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 - SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 - SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 - - ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 - pmulhrsw m5, m10 ; stp1_13 - pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 - pmulhrsw m6, m10 ; stp1_12 - pmulhrsw m7, m10 ; stp1_11 -%else - BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 - SWAP 5, 4 - BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 - SWAP 6, 7 -%endif - ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova [stp + %2 + idx8], m0 - mova [stp + %2 + idx9], m2 - mova [stp + %2 + idx10], m4 - mova [stp + %2 + idx11], m7 - mova [stp + %2 + idx12], m6 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m3 - mova [stp + %2 + idx15], m1 - - ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, [rsp + transposed_in + 16 * 4] - mova m12, [rsp + transposed_in + 16 * 28] - BUTTERFLY_4X 11, 12, 3196, 16069, m8, 9, 10 ; stp1_4, stp1_7 - - mova m13, [rsp + transposed_in + 16 * 12] - mova m14, [rsp + transposed_in + 16 * 20] - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_5, stp1_6 - - ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 0] - mova m1, [rsp + transposed_in + 16 * 16] - -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 0, 1, 9 - pmulhrsw m0, m10 ; stp1_1 - pmulhrsw m1, m10 ; stp1_0 -%else - BUTTERFLY_4X 0, 1, 11585, 
11585, m8, 9, 10 ; stp1_1, stp1_0 - SWAP 0, 1 -%endif - mova m2, [rsp + transposed_in + 16 * 8] - mova m3, [rsp + transposed_in + 16 * 24] - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_2, stp1_3 - - mova m10, [pw_11585x2] - SUM_SUB 11, 14, 9 ; stp1_4, stp1_5 - SUM_SUB 12, 13, 9 ; stp1_7, stp1_6 - - ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_6 - pmulhrsw m14, m10 ; stp1_5 -%else - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 - SWAP 13, 14 -%endif - SUM_SUB 0, 3, 9 ; stp1_0, stp1_3 - SUM_SUB 1, 2, 9 ; stp1_1, stp1_2 - - ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 - SUM_SUB 1, 13, 9 ; stp1_1, stp1_6 - SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 - SUM_SUB 3, 11, 9 ; stp1_3, stp1_4 - - ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %2 + idx12] - mova m5, [stp + %2 + idx13] - mova m6, [stp + %2 + idx14] - mova m7, [stp + %2 + idx15] - SUM_SUB 0, 7, 9 ; stp1_0, stp1_15 - SUM_SUB 1, 6, 9 ; stp1_1, stp1_14 - SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 - SUM_SUB 3, 4, 9 ; stp1_3, stp1_12 - - ; 0-3, 28-31 final stage - mova m10, [stp + %4 + idx31] - mova m15, [stp + %4 + idx30] - SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 - SUM_SUB 1, 15, 9 ; stp1_1, stp1_30 - mova [stp + %1 + idx0], m0 - mova [stp + %1 + idx1], m1 - mova [stp + %4 + idx31], m10 - mova [stp + %4 + idx30], m15 - mova m0, [stp + %4 + idx29] - mova m1, [stp + %4 + idx28] - SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 - SUM_SUB 3, 1, 9 ; stp1_3, stp1_28 - mova [stp + %1 + idx2], m2 - mova [stp + %1 + idx3], m3 - mova [stp + %4 + idx29], m0 - mova [stp + %4 + idx28], m1 - - ; 12-15, 16-19 final stage - mova m0, [stp + %3 + idx16] - mova m1, [stp + %3 + idx17] - mova m2, [stp + %3 + idx18] - mova m3, [stp + %3 + idx19] - SUM_SUB 7, 0, 9 ; stp1_15, stp1_16 - SUM_SUB 6, 1, 9 ; stp1_14, stp1_17 - SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 - SUM_SUB 4, 3, 9 ; stp1_12, stp1_19 - mova [stp + %2 + idx12], m4 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m6 - mova [stp + %2 + idx15], m7 - mova [stp + %3 + idx16], m0 - mova [stp + %3 + idx17], m1 - mova [stp + %3 + idx18], m2 - mova [stp + %3 + idx19], m3 - - mova m4, [stp + %2 + idx8] - mova m5, [stp + %2 + idx9] - mova m6, [stp + %2 + idx10] - mova m7, [stp + %2 + idx11] - SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 - SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 - SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 - SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 - - ; 4-7, 24-27 final stage - mova m3, [stp + %4 + idx24] - mova m2, [stp + %4 + idx25] - mova m1, [stp + %4 + idx26] - mova m0, [stp + %4 + idx27] - SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 - SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 - SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 - SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 - mova [stp + %4 + idx24], m3 - mova [stp + %4 + idx25], m2 - mova [stp + %4 + idx26], m1 - mova [stp + %4 + idx27], m0 - mova [stp + %1 + idx4], m11 - mova [stp + %1 + idx5], m14 - mova [stp + %1 + idx6], m13 - mova [stp + %1 + idx7], m12 - - ; 8-11, 20-23 final stage - mova m0, [stp + %3 + idx20] - mova m1, [stp + %3 + idx21] - mova m2, [stp + %3 + idx22] - mova m3, [stp + %3 + idx23] - SUM_SUB 7, 0, 9 ; stp1_11, stp_20 - SUM_SUB 6, 1, 9 ; stp1_10, stp_21 - SUM_SUB 5, 2, 9 ; stp1_9, stp_22 - SUM_SUB 4, 3, 9 ; stp1_8, stp_23 - mova [stp + %2 + idx8], m4 - mova [stp + %2 + idx9], m5 - mova [stp + %2 + idx10], m6 - mova [stp + %2 + idx11], m7 - mova [stp 
+ %3 + idx20], m0 - mova [stp + %3 + idx21], m1 - mova [stp + %3 + idx22], m2 - mova [stp + %3 + idx23], m3 -%endmacro - -INIT_XMM ssse3 -cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride - mova m8, [pd_8192] - mov r6, 4 - lea stp, [rsp + pass_one_start] - -idct32x32_1024: - mov r3, inputq - lea r4, [rsp + transposed_in] - mov r7, 4 - -idct32x32_1024_transpose: -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [r3 + 0] - packssdw m0, [r3 + 16] - mova m1, [r3 + 32 * 4] - packssdw m1, [r3 + 32 * 4 + 16] - mova m2, [r3 + 32 * 8] - packssdw m2, [r3 + 32 * 8 + 16] - mova m3, [r3 + 32 * 12] - packssdw m3, [r3 + 32 * 12 + 16] - mova m4, [r3 + 32 * 16] - packssdw m4, [r3 + 32 * 16 + 16] - mova m5, [r3 + 32 * 20] - packssdw m5, [r3 + 32 * 20 + 16] - mova m6, [r3 + 32 * 24] - packssdw m6, [r3 + 32 * 24 + 16] - mova m7, [r3 + 32 * 28] - packssdw m7, [r3 + 32 * 28 + 16] -%else - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 4] - mova m2, [r3 + 16 * 8] - mova m3, [r3 + 16 * 12] - mova m4, [r3 + 16 * 16] - mova m5, [r3 + 16 * 20] - mova m6, [r3 + 16 * 24] - mova m7, [r3 + 16 * 28] -%endif - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 -%if CONFIG_VP9_HIGHBITDEPTH - add r3, 32 -%else - add r3, 16 -%endif - add r4, 16 * 8 - dec r7 - jne idct32x32_1024_transpose - - IDCT32X32_1024 16*0, 16*32, 16*64, 16*96 - - lea stp, [stp + 16 * 8] -%if CONFIG_VP9_HIGHBITDEPTH - lea inputq, [inputq + 32 * 32] -%else - lea inputq, [inputq + 16 * 32] -%endif - dec r6 - jnz idct32x32_1024 - - mov r6, 4 - lea stp, [rsp + pass_one_start] - lea r9, [rsp + pass_one_start] - -idct32x32_1024_2: - lea r4, [rsp + transposed_in] - mov r3, r9 - mov r7, 4 - -idct32x32_1024_transpose_2: - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 1] - mova m2, [r3 + 16 * 2] - mova m3, [r3 + 16 * 3] - mova m4, [r3 + 16 * 4] - mova m5, [r3 + 16 * 5] - mova m6, [r3 + 16 * 6] - mova m7, [r3 + 16 * 7] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 - - add r3, 16 * 8 - add r4, 16 * 8 - dec r7 - jne idct32x32_1024_transpose_2 - - IDCT32X32_1024 16*0, 16*8, 16*16, 16*24 - - lea stp, [stp + 16 * 32] - add r9, 16 * 32 - dec r6 - jnz idct32x32_1024_2 - - RECON_AND_STORE pass_two_start - - RET -%endif diff --git a/libs/libvpx/vpx_dsp/x86/inv_wht_sse2.asm b/libs/libvpx/vpx_dsp/x86/inv_wht_sse2.asm index fbbcd76bd7..bcf1a6ef98 100644 --- a/libs/libvpx/vpx_dsp/x86/inv_wht_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/inv_wht_sse2.asm @@ -9,6 +9,7 @@ ; %include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" SECTION .text @@ -82,15 +83,8 @@ SECTION .text INIT_XMM sse2 cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] -%endif + LOAD_TRAN_LOW 0, inputq, 0 + LOAD_TRAN_LOW 1, inputq, 8 psraw m0, 2 psraw m1, 2 diff --git a/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c index 85923b4478..6652a62dcf 100644 --- a/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c +++ b/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c @@ -13,10 +13,10 @@ #include 
"./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" -void vpx_lpf_horizontal_edge_8_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); @@ -367,7 +367,7 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 }; -void vpx_lpf_horizontal_edge_16_avx2(unsigned char *s, int p, +void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { diff --git a/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c b/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c index e13334ae0e..28e6fd65f9 100644 --- a/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c @@ -229,10 +229,10 @@ void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0); } -void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); @@ -591,7 +591,7 @@ static INLINE __m128i filter16_mask(const __m128i *const flat, return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); } -void vpx_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, +void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { @@ -1745,7 +1745,7 @@ void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, transpose(src, p, dst, 8, 2); // Loop filtering - vpx_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh); + vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh); src[0] = t_dst; src[1] = t_dst + 8 * 8; @@ -1766,7 +1766,7 @@ void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p, transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); // Loop filtering - vpx_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh); + vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh); // Transpose back transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); diff --git a/libs/libvpx/vpx_dsp/x86/mem_sse2.h b/libs/libvpx/vpx_dsp/x86/mem_sse2.h new file mode 100644 index 0000000000..2ce738fb77 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/mem_sse2.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_X86_MEM_SSE2_H_ +#define VPX_DSP_X86_MEM_SSE2_H_ + +#include // SSE2 + +#include "./vpx_config.h" + +static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride)); + d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride)); + d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride)); + d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride)); +} + +static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + load_8bit_4x4(s + 0 * stride, stride, &d[0]); + load_8bit_4x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride)); + d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride)); + d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride)); + d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride)); +} + +static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + load_8bit_8x4(s + 0 * stride, stride, &d[0]); + load_8bit_8x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void load_8bit_16x8(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride)); + d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride)); + d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride)); + d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride)); + d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride)); + d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride)); + d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride)); + d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride)); +} + +static INLINE void loadu_8bit_16x4(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride)); + d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride)); + d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride)); + d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride)); +} + +static INLINE void loadu_8bit_16x8(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + loadu_8bit_16x4(s + 0 * stride, stride, &d[0]); + loadu_8bit_16x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) { + _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s)); +} + +static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d, + const ptrdiff_t stride) { + *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]); + *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]); + *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]); + *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]); +} + +static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d, + const ptrdiff_t stride) { + __m128i ss[4]; + + ss[0] = s; + ss[1] = _mm_srli_si128(s, 4); + ss[2] = _mm_srli_si128(s, 8); + ss[3] = _mm_srli_si128(s, 12); + store_8bit_4x4(ss, d, stride); +} + +static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s, + uint8_t *const d, + const ptrdiff_t stride) { + _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); + _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]); + _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]); + _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]); +} + +static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d, + const ptrdiff_t 
stride) { + _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); + _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]); + _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]); + _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]); + _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]); + _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]); + _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]); + _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]); +} + +static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d, + const ptrdiff_t stride) { + _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]); + _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]); + _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]); + _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]); +} + +#endif // VPX_DSP_X86_MEM_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/quantize_avx.c b/libs/libvpx/vpx_dsp/x86/quantize_avx.c new file mode 100644 index 0000000000..6f4489004d --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/quantize_avx.c @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#if defined(_MSC_VER) +#include +#endif +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_x86.h" + +void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m256i big_zero = _mm256_setzero_si256(); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); + + *eob_ptr = 0; + + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. 
+ coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (n_coeffs == 16) return; + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, + zero); + } + + // AC only loop. 
+ for (index = 16; index < n_coeffs; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} + +void vpx_quantize_b_32x32_avx( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m256i big_zero = _mm256_setzero_si256(); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + (void)scan_ptr; + (void)n_coeffs; + (void)skip_block; + assert(!skip_block); + + // Setup global values. + // The 32x32 halves zbin and round. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + // Shift with rounding. + zbin = _mm_add_epi16(zbin, one); + zbin = _mm_srli_epi16(zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. + zbin = _mm_sub_epi16(zbin, one); + + round = _mm_load_si128((const __m128i *)round_ptr); + round = _mm_add_epi16(round, one); + round = _mm_srli_epi16(round, 1); + + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + shift = _mm_slli_epi16(shift, 1); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC. 
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + // Un-sign to bias rounding like C. + // dequant is almost always negative, so this is probably the backwards way + // to handle the sign. However, it matches the previous assembly. + coeff0 = _mm_abs_epi16(qcoeff0); + coeff1 = _mm_abs_epi16(qcoeff1); + + coeff0 = calculate_dqcoeff(coeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(coeff1, dequant); + + // "Divide" by 2. + coeff0 = _mm_srli_epi16(coeff0, 1); + coeff1 = _mm_srli_epi16(coeff1, 1); + + coeff0 = _mm_sign_epi16(coeff0, qcoeff0); + coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, + zero); + } + + // AC only loop. 
+ for (index = 16; index < 32 * 32; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = _mm_abs_epi16(qcoeff0); + coeff1 = _mm_abs_epi16(qcoeff1); + + coeff0 = calculate_dqcoeff(coeff0, dequant); + coeff1 = calculate_dqcoeff(coeff1, dequant); + + coeff0 = _mm_srli_epi16(coeff0, 1); + coeff1 = _mm_srli_epi16(coeff1, 1); + + coeff0 = _mm_sign_epi16(coeff0, qcoeff0); + coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/libs/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm b/libs/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm deleted file mode 100644 index 01c41291be..0000000000 --- a/libs/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm +++ /dev/null @@ -1,544 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, \ - eob, scan, iscan - - vzeroupper - - ; If we can skip this block, then just zero the output - cmp skipmp, 0 - jne .blank - -%ifnidn %1, b_32x32 - - ; Special case for ncoeff == 16, as it is frequent and we can save on - ; not setting up a loop. - cmp ncoeffmp, 16 - jne .generic - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Special case of ncoeff == 16 - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -.single: - - movifnidn coeffq, coeffmp - movifnidn zbinq, zbinmp - mova m0, [zbinq] ; m0 = zbin - - ; Get DC and first 15 AC coeffs - in this special case, that is all. 
-%if CONFIG_VP9_HIGHBITDEPTH - ; coeff stored as 32bit numbers but we process them as 16 bit numbers - mova m9, [coeffq] - packssdw m9, [coeffq+16] ; m9 = c[i] - mova m10, [coeffq+32] - packssdw m10, [coeffq+48] ; m10 = c[i] -%else - mova m9, [coeffq] ; m9 = c[i] - mova m10, [coeffq+16] ; m10 = c[i] -%endif - - mov r0, eobmp ; Output pointer - mov r1, qcoeffmp ; Output pointer - mov r2, dqcoeffmp ; Output pointer - - pxor m5, m5 ; m5 = dedicated zero - - pcmpeqw m4, m4 ; All word lanes -1 - paddw m0, m4 ; m0 = zbin - 1 - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - - ; Check if all coeffs are less than zbin. If yes, we just write zeros - ; to the outputs and we are done. - por m14, m7, m12 - ptest m14, m14 - jnz .single_nonzero - -%if CONFIG_VP9_HIGHBITDEPTH - mova [r1 ], ymm5 - mova [r1+32], ymm5 - mova [r2 ], ymm5 - mova [r2+32], ymm5 -%else - mova [r1], ymm5 - mova [r2], ymm5 -%endif - mov [r0], word 0 - - vzeroupper - RET - -.single_nonzero: - - ; Actual quantization of size 16 block - setup pointers, rounders, etc. - movifnidn r4, roundmp - movifnidn r5, quantmp - mov r3, dequantmp - mov r6, shiftmp - mova m1, [r4] ; m1 = round - mova m2, [r5] ; m2 = quant - mova m3, [r3] ; m3 = dequant - mova m4, [r6] ; m4 = shift - - mov r3, iscanmp - - DEFINE_ARGS eob, qcoeff, dqcoeff, iscan - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 - -%if CONFIG_VP9_HIGHBITDEPTH - ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [qcoeffq ], m11 - mova [qcoeffq+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [qcoeffq+32], m11 - mova [qcoeffq+48], m6 -%else - mova [qcoeffq ], m8 - mova [qcoeffq+16], m13 -%endif - - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q - -%if CONFIG_VP9_HIGHBITDEPTH - ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [dqcoeffq ], m11 - mova [dqcoeffq+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [dqcoeffq+32], m11 - mova [dqcoeffq+48], m6 -%else - mova [dqcoeffq ], m8 - mova [dqcoeffq+16], m13 -%endif - - mova m6, [iscanq] ; m6 = scan[i] - mova m11, [iscanq+16] ; m11 = scan[i] - - pcmpeqw m8, m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m13, m5 ; m13 = c[i] == 0 - psubw m6, m6, m7 ; m6 = scan[i] + 1 - psubw m11, m11, m12 ; m11 = scan[i] + 1 - pandn m8, m8, m6 ; m8 = max(eob) - pandn m13, m13, m11 ; m13 = max(eob) - pmaxsw m8, m8, m13 - - ; Horizontally accumulate/max eobs and write into [eob] memory pointer - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - movq rax, m8 - mov [eobq], ax - - vzeroupper - RET - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Generic case of ncoeff != 16 - 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -.generic: - -%endif ; %ifnidn %1, b_32x32 - -DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ - qcoeff, dqcoeff, dequant, eob, scan, iscan - - ; Actual quantization loop - setup pointers, rounders, etc. - movifnidn coeffq, coeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp - movifnidn zbinq, zbinmp - movifnidn roundq, roundmp - movifnidn quantq, quantmp - mova m0, [zbinq] ; m0 = zbin - mova m1, [roundq] ; m1 = round - mova m2, [quantq] ; m2 = quant - mova m3, [r2] ; m3 = dequant - pcmpeqw m4, m4 ; All lanes -1 -%ifidn %1, b_32x32 - psubw m0, m4 - psubw m1, m4 - psrlw m0, 1 ; m0 = (m0 + 1) / 2 - psrlw m1, 1 ; m1 = (m1 + 1) / 2 -%endif - paddw m0, m4 ; m0 = m0 + 1 - - mov r2, shiftmp - mov r3, qcoeffmp - mova m4, [r2] ; m4 = shift - mov r4, dqcoeffmp - mov r5, iscanmp -%ifidn %1, b_32x32 - psllw m4, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero - - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob - -%if CONFIG_VP9_HIGHBITDEPTH - lea coeffq, [ coeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] - lea dqcoeffq, [dqcoeffq+ncoeffq*4] -%else - lea coeffq, [ coeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] - lea dqcoeffq, [dqcoeffq+ncoeffq*2] -%endif - lea iscanq, [ iscanq+ncoeffq*2] - neg ncoeffq - - ; get DC and first 15 AC coeffs -%if CONFIG_VP9_HIGHBITDEPTH - ; coeff stored as 32bit numbers & require 16bit numbers - mova m9, [coeffq+ncoeffq*4+ 0] - packssdw m9, [coeffq+ncoeffq*4+16] - mova m10, [coeffq+ncoeffq*4+32] - packssdw m10, [coeffq+ncoeffq*4+48] -%else - mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - - ; Check if all coeffs are less than zbin. If yes, skip forward quickly. 
- por m14, m7, m12 - ptest m14, m14 - jnz .first_nonzero - -%if CONFIG_VP9_HIGHBITDEPTH - mova [qcoeffq+ncoeffq*4 ], ymm5 - mova [qcoeffq+ncoeffq*4+32], ymm5 - mova [dqcoeffq+ncoeffq*4 ], ymm5 - mova [dqcoeffq+ncoeffq*4+32], ymm5 -%else - mova [qcoeffq+ncoeffq*2], ymm5 - mova [dqcoeffq+ncoeffq*2], ymm5 -%endif - - add ncoeffq, mmsize - - punpckhqdq m1, m1 - punpckhqdq m2, m2 - punpckhqdq m3, m3 - punpckhqdq m4, m4 - pxor m8, m8 - - jmp .ac_only_loop - -.first_nonzero: - - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 -%else - mova [qcoeffq+ncoeffq*2+ 0], m8 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif - -%ifidn %1, b_32x32 - pabsw m8, m8 - pabsw m13, m13 -%endif - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m8, 1 - psrlw m13, 1 - psignw m8, m9 - psignw m13, m10 -%endif - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m8 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - - pcmpeqw m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [iscanq+ncoeffq*2] ; m6 = scan[i] - mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m8, m6 ; m8 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m13 - add ncoeffq, mmsize - -.ac_only_loop: - -%if CONFIG_VP9_HIGHBITDEPTH - ; pack coeff from 32bit to 16bit array - mova m9, [coeffq+ncoeffq*4+ 0] - packssdw m9, [coeffq+ncoeffq*4+16] - mova m10, [coeffq+ncoeffq*4+32] - packssdw m10, [coeffq+ncoeffq*4+48] -%else - mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - - ; Check if all coeffs are less than zbin. If yes, skip this itertion. - ; And just write zeros as the result would be. 
- por m14, m7, m12 - ptest m14, m14 - jnz .rest_nonzero - -%if CONFIG_VP9_HIGHBITDEPTH - mova [qcoeffq+ncoeffq*4+ 0], ymm5 - mova [qcoeffq+ncoeffq*4+32], ymm5 - mova [dqcoeffq+ncoeffq*4+ 0], ymm5 - mova [dqcoeffq+ncoeffq*4+32], ymm5 -%else - mova [qcoeffq+ncoeffq*2+ 0], ymm5 - mova [dqcoeffq+ncoeffq*2+ 0], ymm5 -%endif - add ncoeffq, mmsize - jnz .ac_only_loop - - ; Horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - movq rax, m8 - mov [r2], ax - vzeroupper - RET - -.rest_nonzero: - paddsw m6, m1 ; m6 += round - paddsw m11, m1 ; m11 += round - pmulhw m14, m6, m2 ; m14 = m6*q>>16 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m14, m6 ; m14 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m14, m4 ; m14 = m14*qsh>>16 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m14, m9 ; m14 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m14, m7 - pand m13, m12 - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m14 - punpckhwd m6, m14, m6 - pmovsxwd m11, m14 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 -%else - mova [qcoeffq+ncoeffq*2+ 0], m14 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif - -%ifidn %1, b_32x32 - pabsw m14, m14 - pabsw m13, m13 -%endif - pmullw m14, m3 ; dqc[i] = qc[i] * q - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m14, 1 - psrlw m13, 1 - psignw m14, m9 - psignw m13, m10 -%endif - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m14 - punpckhwd m6, m14, m6 - pmovsxwd m11, m14 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m14 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - - pcmpeqw m14, m5 ; m14 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m14, m6 ; m14 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m14 - pmaxsw m8, m13 - add ncoeffq, mmsize - jnz .ac_only_loop - - ; Horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - movq rax, m8 - mov [r2], ax - vzeroupper - RET - - ; Skip-block, i.e. 
just write all zeroes -.blank: - -DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ - qcoeff, dqcoeff, dequant, eob, scan, iscan - - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - -DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob - -%if CONFIG_VP9_HIGHBITDEPTH - lea dqcoeffq, [dqcoeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] -%else - lea dqcoeffq, [dqcoeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] -%endif - - neg ncoeffq - pxor m7, m7 - -.blank_loop: -%if CONFIG_VP9_HIGHBITDEPTH - mova [dqcoeffq+ncoeffq*4+ 0], ymm7 - mova [dqcoeffq+ncoeffq*4+32], ymm7 - mova [qcoeffq+ncoeffq*4+ 0], ymm7 - mova [qcoeffq+ncoeffq*4+32], ymm7 -%else - mova [dqcoeffq+ncoeffq*2+ 0], ymm7 - mova [qcoeffq+ncoeffq*2+ 0], ymm7 -%endif - add ncoeffq, mmsize - jl .blank_loop - - mov [eobq], word 0 - - vzeroupper - RET -%endmacro - -INIT_XMM avx -QUANTIZE_FN b, 7 -QUANTIZE_FN b_32x32, 7 - -END diff --git a/libs/libvpx/vpx_dsp/x86/quantize_sse2.c b/libs/libvpx/vpx_dsp/x86/quantize_sse2.c index 2c7e431c74..c020b398c3 100644 --- a/libs/libvpx/vpx_dsp/x86/quantize_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/quantize_sse2.c @@ -8,37 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" - -static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { -#if CONFIG_VP9_HIGHBITDEPTH - return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], - (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], - (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], - (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); -#else - return _mm_load_si128((const __m128i *)coeff_ptr); -#endif -} - -static INLINE void store_coefficients(__m128i coeff_vals, - tran_low_t *coeff_ptr) { -#if CONFIG_VP9_HIGHBITDEPTH - __m128i one = _mm_set1_epi16(1); - __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); - __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); - __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); - __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); - _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); -#else - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); -#endif -} +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_x86.h" void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, @@ -47,202 +24,103 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { - __m128i zero; + const __m128i zero = _mm_setzero_si128(); + int index = 16; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); - coeff_ptr += n_coeffs; - iscan_ptr += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - if (!skip_block) { - __m128i eob; - __m128i zbin; - __m128i round, quant, dequant, shift; - { - __m128i coeff0, coeff1; + // Setup global values. 
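For context, the scalar recurrence that load_b_values() and calculate_qcoeff() vectorize is roughly the sketch below. The helper name quantize_one_scalar and its free-standing signature are illustrative only, not part of the patch, and the saturating add that _mm_adds_epi16 provides for free is written out explicitly.

  #include <stdint.h>
  #include <stdlib.h>

  /* Illustrative scalar model of one quantize_b coefficient. */
  static int16_t quantize_one_scalar(int16_t coeff, int16_t zbin, int16_t round,
                                     int16_t quant, int16_t shift,
                                     int16_t dequant, int16_t *dqcoeff) {
    const int abs_coeff = abs(coeff);
    int tmp, q;
    if (abs_coeff < zbin) {               /* the SIMD code tests "> zbin - 1" instead */
      *dqcoeff = 0;
      return 0;
    }
    tmp = abs_coeff + round;
    if (tmp > INT16_MAX) tmp = INT16_MAX; /* _mm_adds_epi16 saturates here */
    q = ((((tmp * quant) >> 16) + tmp) * shift) >> 16;
    q = coeff < 0 ? -q : q;               /* reinsert the sign */
    *dqcoeff = (int16_t)(q * dequant);    /* low 16 bits, as _mm_mullo_epi16 keeps */
    return (int16_t)q;
  }

The 32x32 variant later in this patch differs only in halving zbin and round up front (with rounding), doubling shift, and halving the dequantized result.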
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); - // Setup global values - { - __m128i pw_1; - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - pw_1 = _mm_set1_epi16(1); - zbin = _mm_sub_epi16(zbin, pw_1); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - } + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; - // Do DC and first 15 AC - coeff0 = load_coefficients(coeff_ptr + n_coeffs); - coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - shift = _mm_unpackhi_epi64(shift, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); + calculate_qcoeff(&qcoeff0, round, quant, shift); - // Reinsert signs - qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + calculate_qcoeff(&qcoeff1, round, quant, shift); - store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); - store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); - store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); - } + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, 
nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); - coeff0 = load_coefficients(coeff_ptr + n_coeffs); - coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); + eob = + scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); - // Reinsert signs - qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); - store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); - store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); - store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); - } + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - { - // Scan 
for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - } + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - store_coefficients(zero, dqcoeff_ptr + n_coeffs); - store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8); - store_coefficients(zero, qcoeff_ptr + n_coeffs); - store_coefficients(zero, qcoeff_ptr + n_coeffs + 8); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + + index += 16; } + + *eob_ptr = accumulate_eob(eob); } diff --git a/libs/libvpx/vpx_dsp/x86/quantize_ssse3.c b/libs/libvpx/vpx_dsp/x86/quantize_ssse3.c new file mode 100644 index 0000000000..3f528e1a97 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/quantize_ssse3.c @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_x86.h" + +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const __m128i zero = _mm_setzero_si128(); + int index = 16; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); + + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. 
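As in the SSE2 version above, zbin/round/quant/shift are loaded with the DC entry in lane 0 and the AC entry in the remaining lanes, so after the register has been applied to the first eight coefficients (the block containing DC) the code broadcasts the AC half across the whole register, as the "Switch DC to AC" comments note. A minimal sketch of that lane switch (dc_to_ac is a hypothetical name):

  #include <emmintrin.h>

  /* { dc, ac, ac, ... } -> { ac, ac, ac, ... }: duplicate the upper 64 bits. */
  static __m128i dc_to_ac(__m128i v) { return _mm_unpackhi_epi64(v, v); }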
+ coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + eob = + scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} + +void vpx_quantize_b_32x32_ssse3( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + (void)scan_ptr; + (void)n_coeffs; + (void)skip_block; + assert(!skip_block); + + // Setup global values. + // The 32x32 halves zbin and round. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + // Shift with rounding. + zbin = _mm_add_epi16(zbin, one); + zbin = _mm_srli_epi16(zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. 
+ zbin = _mm_sub_epi16(zbin, one); + + round = _mm_load_si128((const __m128i *)round_ptr); + round = _mm_add_epi16(round, one); + round = _mm_srli_epi16(round, 1); + + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + // I suspect this is not technically OK because quant_shift can be up + // to 1 << 16 and shifting up again will outrange that, but the test is not + // comprehensive enough to catch that and "it's been that way forever" + shift = _mm_slli_epi16(shift, 1); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC. + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); +#endif // CONFIG_HIGHBITDEPTH + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + // Un-sign to bias rounding like C. + // dequant is almost always negative, so this is probably the backwards way + // to handle the sign. However, it matches the previous assembly. + coeff0 = _mm_abs_epi16(qcoeff0); + coeff1 = _mm_abs_epi16(qcoeff1); + + coeff0 = calculate_dqcoeff(coeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(coeff1, dequant); + + // "Divide" by 2. + coeff0 = _mm_srli_epi16(coeff0, 1); + coeff1 = _mm_srli_epi16(coeff1, 1); + + coeff0 = _mm_sign_epi16(coeff0, qcoeff0); + coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, + zero); + } + + // AC only loop. 
+ for (index = 16; index < 32 * 32; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = _mm_abs_epi16(qcoeff0); + coeff1 = _mm_abs_epi16(qcoeff1); + + coeff0 = calculate_dqcoeff(coeff0, dequant); + coeff1 = calculate_dqcoeff(coeff1, dequant); + + coeff0 = _mm_srli_epi16(coeff0, 1); + coeff1 = _mm_srli_epi16(coeff1, 1); + + coeff0 = _mm_sign_epi16(coeff0, qcoeff0); + coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/libs/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/libs/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm deleted file mode 100644 index ca21539173..0000000000 --- a/libs/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm +++ /dev/null @@ -1,346 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_1: times 8 dw 1 - -SECTION .text - -; TODO(yunqingwang)fix quantize_b code for skip=1 case. -%macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, \ - eob, scan, iscan - cmp dword skipm, 0 - jne .blank - - ; actual quantize loop - setup pointers, rounders, etc. 
- movifnidn coeffq, coeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp - movifnidn zbinq, zbinmp - movifnidn roundq, roundmp - movifnidn quantq, quantmp - mova m0, [zbinq] ; m0 = zbin - mova m1, [roundq] ; m1 = round - mova m2, [quantq] ; m2 = quant -%ifidn %1, b_32x32 - pcmpeqw m5, m5 - psrlw m5, 15 - paddw m0, m5 - paddw m1, m5 - psrlw m0, 1 ; m0 = (m0 + 1) / 2 - psrlw m1, 1 ; m1 = (m1 + 1) / 2 -%endif - mova m3, [r2q] ; m3 = dequant - psubw m0, [pw_1] - mov r2, shiftmp - mov r3, qcoeffmp - mova m4, [r2] ; m4 = shift - mov r4, dqcoeffmp - mov r5, iscanmp -%ifidn %1, b_32x32 - psllw m4, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob -%if CONFIG_VP9_HIGHBITDEPTH - lea coeffq, [ coeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] - lea dqcoeffq, [dqcoeffq+ncoeffq*4] -%else - lea coeffq, [ coeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] - lea dqcoeffq, [dqcoeffq+ncoeffq*2] -%endif - lea iscanq, [ iscanq+ncoeffq*2] - neg ncoeffq - - ; get DC and first 15 AC coeffs -%if CONFIG_VP9_HIGHBITDEPTH - ; coeff stored as 32bit numbers & require 16bit numbers - mova m9, [ coeffq+ncoeffq*4+ 0] - packssdw m9, [ coeffq+ncoeffq*4+16] - mova m10, [ coeffq+ncoeffq*4+32] - packssdw m10, [ coeffq+ncoeffq*4+48] -%else - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m8 - mova m6, m8 - pcmpgtw m5, m8 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 ; reset m5 to zero register -%else - mova [qcoeffq+ncoeffq*2+ 0], m8 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif -%ifidn %1, b_32x32 - pabsw m8, m8 - pabsw m13, m13 -%endif - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m8, 1 - psrlw m13, 1 - psignw m8, m9 - psignw m13, m10 -%endif -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m8 - mova m6, m8 - pcmpgtw m5, m8 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 ; reset m5 to zero register -%else - mova [dqcoeffq+ncoeffq*2+ 0], m8 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - pcmpeqw m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 
= scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m8, m6 ; m8 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m13 - add ncoeffq, mmsize - jz .accumulate_eob - -.ac_only_loop: -%if CONFIG_VP9_HIGHBITDEPTH - ; pack coeff from 32bit to 16bit array - mova m9, [ coeffq+ncoeffq*4+ 0] - packssdw m9, [ coeffq+ncoeffq*4+16] - mova m10, [ coeffq+ncoeffq*4+32] - packssdw m10, [ coeffq+ncoeffq*4+48] -%else - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin -%ifidn %1, b_32x32 - pmovmskb r6d, m7 - pmovmskb r2d, m12 - or r6, r2 - jz .skip_iter -%endif - paddsw m6, m1 ; m6 += round - paddsw m11, m1 ; m11 += round - pmulhw m14, m6, m2 ; m14 = m6*q>>16 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m14, m6 ; m14 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m14, m4 ; m14 = m14*qsh>>16 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m14, m9 ; m14 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m14, m7 - pand m13, m12 -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pxor m11, m11 - mova m11, m14 - mova m6, m14 - pcmpgtw m5, m14 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 ; reset m5 to zero register -%else - mova [qcoeffq+ncoeffq*2+ 0], m14 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif -%ifidn %1, b_32x32 - pabsw m14, m14 - pabsw m13, m13 -%endif - pmullw m14, m3 ; dqc[i] = qc[i] * q - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m14, 1 - psrlw m13, 1 - psignw m14, m9 - psignw m13, m10 -%endif -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m14 - mova m6, m14 - pcmpgtw m5, m14 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m14 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - pcmpeqw m14, m5 ; m14 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m14, m6 ; m14 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m14 - pmaxsw m8, m13 - add ncoeffq, mmsize - jl .ac_only_loop - -%ifidn %1, b_32x32 - jmp .accumulate_eob -.skip_iter: -%if CONFIG_VP9_HIGHBITDEPTH - mova [qcoeffq+ncoeffq*4+ 0], m5 - mova [qcoeffq+ncoeffq*4+16], m5 - mova [qcoeffq+ncoeffq*4+32], m5 - mova [qcoeffq+ncoeffq*4+48], m5 - mova [dqcoeffq+ncoeffq*4+ 0], m5 - mova [dqcoeffq+ncoeffq*4+16], m5 - mova [dqcoeffq+ncoeffq*4+32], m5 - mova [dqcoeffq+ncoeffq*4+48], m5 -%else - mova [qcoeffq+ncoeffq*2+ 0], m5 - mova [qcoeffq+ncoeffq*2+16], m5 - mova [dqcoeffq+ncoeffq*2+ 0], m5 - mova [dqcoeffq+ncoeffq*2+16], m5 -%endif - add ncoeffq, mmsize - jl .ac_only_loop -%endif - -.accumulate_eob: - ; horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 
0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - pextrw r6, m8, 0 - mov [r2], r6 - RET - - ; skip-block, i.e. just write all zeroes -.blank: - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob -%if CONFIG_VP9_HIGHBITDEPTH - lea dqcoeffq, [dqcoeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] -%else - lea dqcoeffq, [dqcoeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] -%endif - neg ncoeffq - pxor m7, m7 -.blank_loop: -%if CONFIG_VP9_HIGHBITDEPTH - mova [dqcoeffq+ncoeffq*4+ 0], m7 - mova [dqcoeffq+ncoeffq*4+16], m7 - mova [dqcoeffq+ncoeffq*4+32], m7 - mova [dqcoeffq+ncoeffq*4+48], m7 - mova [qcoeffq+ncoeffq*4+ 0], m7 - mova [qcoeffq+ncoeffq*4+16], m7 - mova [qcoeffq+ncoeffq*4+32], m7 - mova [qcoeffq+ncoeffq*4+48], m7 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m7 - mova [dqcoeffq+ncoeffq*2+16], m7 - mova [qcoeffq+ncoeffq*2+ 0], m7 - mova [qcoeffq+ncoeffq*2+16], m7 -%endif - add ncoeffq, mmsize - jl .blank_loop - mov word [eobq], 0 - RET -%endmacro - -INIT_XMM ssse3 -QUANTIZE_FN b, 7 -QUANTIZE_FN b_32x32, 7 diff --git a/libs/libvpx/vpx_dsp/x86/quantize_x86.h b/libs/libvpx/vpx_dsp/x86/quantize_x86.h new file mode 100644 index 0000000000..34928fbb56 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/quantize_x86.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" + +static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, + const int16_t *round_ptr, __m128i *round, + const int16_t *quant_ptr, __m128i *quant, + const int16_t *dequant_ptr, __m128i *dequant, + const int16_t *shift_ptr, __m128i *shift) { + *zbin = _mm_load_si128((const __m128i *)zbin_ptr); + *round = _mm_load_si128((const __m128i *)round_ptr); + *quant = _mm_load_si128((const __m128i *)quant_ptr); + *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1)); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)shift_ptr); +} + +// With ssse3 and later abs() and sign() are preferred. +static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi16(a, sign); +} + +static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, + const __m128i quant, const __m128i shift) { + __m128i tmp, qcoeff; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + *coeff = _mm_mulhi_epi16(qcoeff, shift); +} + +static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { + return _mm_mullo_epi16(qcoeff, dequant); +} + +// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing +// to zbin to add 1 to the index in 'scan'. 
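In scalar terms, the end-of-block value produced by scan_for_eob() and accumulate_eob() together is simply the largest iscan position plus one over the nonzero quantized coefficients. A rough standalone equivalent, assuming the low-bit-depth int16_t coefficient layout (eob_scalar is a hypothetical name):

  #include <stdint.h>

  static int16_t eob_scalar(const int16_t *qcoeff, const int16_t *iscan,
                            intptr_t n_coeffs) {
    int16_t eob = 0;
    intptr_t i;
    for (i = 0; i < n_coeffs; ++i) {
      const int16_t count = (int16_t)(iscan[i] + 1); /* scan index -> count */
      if (qcoeff[i] != 0 && count > eob) eob = count;
    }
    return eob;
  }

The SIMD helper below gets the "+ 1" by subtracting the all-ones zbin mask and keeps a per-lane running maximum, which accumulate_eob() reduces to a single value at the end.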
+static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, + const __m128i zbin_mask0, + const __m128i zbin_mask1, + const int16_t *scan_ptr, const int index, + const __m128i zero) { + const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero); + __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index)); + __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8)); + __m128i eob0, eob1; + // Add one to convert from indices to counts + scan0 = _mm_sub_epi16(scan0, zbin_mask0); + scan1 = _mm_sub_epi16(scan1, zbin_mask1); + eob0 = _mm_andnot_si128(zero_coeff0, scan0); + eob1 = _mm_andnot_si128(zero_coeff1, scan1); + return _mm_max_epi16(eob0, eob1); +} + +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} diff --git a/libs/libvpx/vpx_dsp/x86/sad4d_avx512.c b/libs/libvpx/vpx_dsp/x86/sad4d_avx512.c new file mode 100644 index 0000000000..5f2ab6ea71 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/sad4d_avx512.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include // AVX512 +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; + __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m512i sum_mlow, sum_mhigh; + int i; + const uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm512_set1_epi16(0); + sum_ref1 = _mm512_set1_epi16(0); + sum_ref2 = _mm512_set1_epi16(0); + sum_ref3 = _mm512_set1_epi16(0); + for (i = 0; i < 64; i++) { + // load src and all refs + src_reg = _mm512_loadu_si512((const __m512i *)src); + ref0_reg = _mm512_loadu_si512((const __m512i *)ref0); + ref1_reg = _mm512_loadu_si512((const __m512i *)ref1); + ref2_reg = _mm512_loadu_si512((const __m512i *)ref2); + ref3_reg = _mm512_loadu_si512((const __m512i *)ref3); + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm512_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm512_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm512_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm512_sad_epu8(ref3_reg, src_reg); + // sum every ref-i + sum_ref0 = _mm512_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm512_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm512_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm512_add_epi32(sum_ref3, ref3_reg); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + { + __m256i sum256; + __m128i sum128; + // in sum_ref-i the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. 
+ // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm512_bslli_epi128(sum_ref1, 4); + sum_ref3 = _mm512_bslli_epi128(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm512_or_si512(sum_ref0, sum_ref1); + sum_ref2 = _mm512_or_si512(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref-i + sum_mlow = _mm512_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm512_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm512_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum256 = _mm256_add_epi32(_mm512_castsi512_si256(sum_mlow), + _mm512_extracti32x8_epi32(sum_mlow, 1)); + sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256), + _mm256_extractf128_si256(sum256, 1)); + + _mm_storeu_si128((__m128i *)(res), sum128); + } +} diff --git a/libs/libvpx/vpx_dsp/x86/sad_sse3.asm b/libs/libvpx/vpx_dsp/x86/sad_sse3.asm index 18279bdb9d..175dcc0895 100644 --- a/libs/libvpx/vpx_dsp/x86/sad_sse3.asm +++ b/libs/libvpx/vpx_dsp/x86/sad_sse3.asm @@ -165,6 +165,8 @@ paddw mm7, mm3 %endmacro +SECTION .text + ;void int vpx_sad16x16x3_sse3( ; unsigned char *src_ptr, ; int src_stride, diff --git a/libs/libvpx/vpx_dsp/x86/sad_sse4.asm b/libs/libvpx/vpx_dsp/x86/sad_sse4.asm index bc67447971..03999dfca2 100644 --- a/libs/libvpx/vpx_dsp/x86/sad_sse4.asm +++ b/libs/libvpx/vpx_dsp/x86/sad_sse4.asm @@ -165,6 +165,8 @@ movdqa [rdi + 16], xmm2 %endmacro +SECTION .text + ;void vpx_sad16x16x8_sse4_1( ; const unsigned char *src_ptr, ; int src_stride, diff --git a/libs/libvpx/vpx_dsp/x86/sad_ssse3.asm b/libs/libvpx/vpx_dsp/x86/sad_ssse3.asm index 49f204fa04..7cf93cf51d 100644 --- a/libs/libvpx/vpx_dsp/x86/sad_ssse3.asm +++ b/libs/libvpx/vpx_dsp/x86/sad_ssse3.asm @@ -146,6 +146,8 @@ %endmacro +SECTION .text + ;void int vpx_sad16x16x3_ssse3( ; unsigned char *src_ptr, ; int src_stride, diff --git a/libs/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm b/libs/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm index 6d58321e03..300fa8aabc 100644 --- a/libs/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm +++ b/libs/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm @@ -44,6 +44,9 @@ paddd %1, xmm1 SUM_ACROSS_Q %1 %endmacro + +SECTION .text + ;void ssim_parms_sse2( ; unsigned char *s, ; int sp, diff --git a/libs/libvpx/vpx_dsp/x86/sum_squares_sse2.c b/libs/libvpx/vpx_dsp/x86/sum_squares_sse2.c index bc5362e10f..026d0ca2f2 100644 --- a/libs/libvpx/vpx_dsp/x86/sum_squares_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/sum_squares_sse2.c @@ -123,6 +123,7 @@ uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) { return vpx_sum_squares_2d_i16_4x4_sse2(src, stride); } else { // Generic case + assert(size % 8 == 0); return vpx_sum_squares_2d_i16_nxn_sse2(src, stride, size); } } diff --git a/libs/libvpx/vpx_dsp/x86/transpose_sse2.h b/libs/libvpx/vpx_dsp/x86/transpose_sse2.h new file mode 100644 index 0000000000..8a0119ca7e --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/transpose_sse2.h @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_ +#define VPX_DSP_X86_TRANSPOSE_SSE2_H_ + +#include // SSE2 + +#include "./vpx_config.h" + +static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + return _mm_unpacklo_epi16(a0, a1); +} + +static INLINE void transpose_8bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 8 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]); + + // Unpack 16 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi16(a0, a1); + const __m128i b1 = _mm_unpackhi_epi16(a0, a1); + const __m128i b2 = _mm_unpacklo_epi16(a2, a3); + const __m128i b3 = _mm_unpackhi_epi16(a2, a3); + + // Unpack 32 bit elements resulting in: + // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + const __m128i c0 = _mm_unpacklo_epi32(b0, b2); + const __m128i c1 = _mm_unpackhi_epi32(b0, b2); + const __m128i c2 = _mm_unpacklo_epi32(b1, b3); + const __m128i c3 = _mm_unpackhi_epi32(b1, b3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(c0, c0); + out[1] = _mm_unpackhi_epi64(c0, c0); + out[2] = _mm_unpacklo_epi64(c1, c1); + out[3] = _mm_unpackhi_epi64(c1, c1); + out[4] = _mm_unpacklo_epi64(c2, c2); + out[5] = _mm_unpackhi_epi64(c2, c2); + out[6] = _mm_unpacklo_epi64(c3, c3); + out[7] = _mm_unpackhi_epi64(c3, c3); +} + +static INLINE void transpose_16bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // out[0]: 00 10 20 30 01 11 21 31 + // out[1]: 02 12 22 32 03 13 23 33 + out[0] = _mm_unpacklo_epi32(a0, a1); + out[1] = _mm_unpackhi_epi32(a0, a1); +} + +static INLINE void transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // in[4]: 40 41 42 43 XX XX XX XX + // in[5]: 50 51 52 53 XX XX XX XX + // in[6]: 60 61 62 63 XX XX XX XX + // in[7]: 70 71 72 73 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 02 12 22 32 03 13 23 33 + // b3: 42 52 62 72 43 53 63 73 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpackhi_epi32(a0, a1); + const __m128i b3 = _mm_unpackhi_epi32(a2, a3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b2, b3); + out[3] = _mm_unpackhi_epi64(b2, b3); +} + +static INLINE void transpose_16bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + // a6: 44 54 45 55 46 56 47 57 + // a7: 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 04 14 24 34 05 15 25 35 + // b3: 44 54 64 74 45 55 65 75 + // b4: 02 12 22 32 03 13 23 33 + // b5: 42 52 62 72 43 53 63 73 + // b6: 06 16 26 36 07 17 27 37 + // b7: 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b3 = _mm_unpacklo_epi32(a6, a7); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b5 = _mm_unpackhi_epi32(a2, a3); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + const __m128i b7 = _mm_unpackhi_epi32(a6, a7); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b4, b5); + out[3] = _mm_unpackhi_epi64(b4, b5); + out[4] = _mm_unpacklo_epi64(b2, b3); + out[5] = _mm_unpackhi_epi64(b2, b3); + out[6] = _mm_unpacklo_epi64(b6, b7); + out[7] = _mm_unpackhi_epi64(b6, b7); +} + +// Transpose in-place +static INLINE void transpose_16bit_16x16(__m128i *const left, + __m128i *const right) { + __m128i tbuf[8]; + transpose_16bit_8x8(left, left); + transpose_16bit_8x8(right, tbuf); + transpose_16bit_8x8(left + 8, right); + transpose_16bit_8x8(right + 8, right + 8); + + left[8] = tbuf[0]; + left[9] = tbuf[1]; + left[10] = tbuf[2]; + left[11] = tbuf[3]; + left[12] = tbuf[4]; + left[13] = tbuf[5]; + left[14] = tbuf[6]; + left[15] = tbuf[7]; +} + +static INLINE void transpose_32bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); +} + +static INLINE void transpose_32bit_4x4x2(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // in[4]: 04 05 06 07 + // in[5]: 14 15 16 17 + // in[6]: 24 25 26 27 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]); + const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +static INLINE void transpose_32bit_8x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 04 05 06 07 + // in[2]: 10 11 12 13 + // in[3]: 14 15 16 17 + // in[4]: 20 21 22 23 + // in[5]: 24 25 26 27 + // in[6]: 30 31 32 33 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]); + const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]); + const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]); + const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]); + const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]); + const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +#endif // VPX_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h index f8edb1b787..0a9542c85b 100644 --- a/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h @@ -18,6 +18,9 @@ _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) +#define pair_set_epi32(a, b) \ + _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a)) + #define dual_set_epi16(a, b) \ _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \ (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a)) diff --git a/libs/libvpx/vpx_dsp/x86/variance_avx2.c b/libs/libvpx/vpx_dsp/x86/variance_avx2.c index 8428e0520d..d15a89c746 100644 --- a/libs/libvpx/vpx_dsp/x86/variance_avx2.c +++ b/libs/libvpx/vpx_dsp/x86/variance_avx2.c @@ -7,16 +7,592 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ + +#include // AVX2 + #include "./vpx_dsp_rtcd.h" +/* clang-format off */ +DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, +}; + +DECLARE_ALIGNED(32, static const int8_t, adjacent_sub_avx2[32]) = { + 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, + 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1 +}; +/* clang-format on */ + +void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *sse, int *sum) { + unsigned int i, src_2strides, ref_2strides; + __m256i sum_reg = _mm256_setzero_si256(); + __m256i sse_reg = _mm256_setzero_si256(); + // process two 16 byte locations in a 256 bit register + src_2strides = source_stride << 1; + ref_2strides = recon_stride << 1; + for (i = 0; i < 8; ++i) { + // convert up values in 128 bit registers across lanes + const __m256i src0 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const *)(src_ptr))); + const __m256i src1 = _mm256_cvtepu8_epi16( + _mm_loadu_si128((__m128i const *)(src_ptr + source_stride))); + const __m256i ref0 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const *)(ref_ptr))); + const __m256i ref1 = _mm256_cvtepu8_epi16( + _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride))); + const __m256i diff0 = _mm256_sub_epi16(src0, ref0); + const __m256i diff1 = _mm256_sub_epi16(src1, ref1); + const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); + const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); + + // add to the running totals + sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff0, diff1)); + sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd0, madd1)); + + src_ptr += src_2strides; + ref_ptr += ref_2strides; + } + { + // extract the low lane and add it to the high lane + const __m128i sum_reg_128 = _mm_add_epi16( + _mm256_castsi256_si128(sum_reg), _mm256_extractf128_si256(sum_reg, 1)); + const __m128i sse_reg_128 = _mm_add_epi32( + _mm256_castsi256_si128(sse_reg), _mm256_extractf128_si256(sse_reg, 1)); + + // sum upper and lower 64 bits together and convert up to 32 bit values + const __m128i sum_reg_64 = + _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8)); + const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64); + + // unpack sse and sum registers and add + const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, sum_int32); + const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, sum_int32); + const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); + + // perform the final summation and extract the results + const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 
8)); + *((int *)sse) = _mm_cvtsi128_si32(res); + *((int *)sum) = _mm_extract_epi32(res, 1); + } +} + +static void get32x16var_avx2(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *sse, int *sum) { + unsigned int i, src_2strides, ref_2strides; + const __m256i adj_sub = _mm256_load_si256((__m256i const *)adjacent_sub_avx2); + __m256i sum_reg = _mm256_setzero_si256(); + __m256i sse_reg = _mm256_setzero_si256(); + + // process 64 elements in an iteration + src_2strides = source_stride << 1; + ref_2strides = recon_stride << 1; + for (i = 0; i < 8; i++) { + const __m256i src0 = _mm256_loadu_si256((__m256i const *)(src_ptr)); + const __m256i src1 = + _mm256_loadu_si256((__m256i const *)(src_ptr + source_stride)); + const __m256i ref0 = _mm256_loadu_si256((__m256i const *)(ref_ptr)); + const __m256i ref1 = + _mm256_loadu_si256((__m256i const *)(ref_ptr + recon_stride)); + + // unpack into pairs of source and reference values + const __m256i src_ref0 = _mm256_unpacklo_epi8(src0, ref0); + const __m256i src_ref1 = _mm256_unpackhi_epi8(src0, ref0); + const __m256i src_ref2 = _mm256_unpacklo_epi8(src1, ref1); + const __m256i src_ref3 = _mm256_unpackhi_epi8(src1, ref1); + + // subtract adjacent elements using src*1 + ref*-1 + const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); + const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); + const __m256i diff2 = _mm256_maddubs_epi16(src_ref2, adj_sub); + const __m256i diff3 = _mm256_maddubs_epi16(src_ref3, adj_sub); + const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); + const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); + const __m256i madd2 = _mm256_madd_epi16(diff2, diff2); + const __m256i madd3 = _mm256_madd_epi16(diff3, diff3); + + // add to the running totals + sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff0, diff1)); + sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff2, diff3)); + sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd0, madd1)); + sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd2, madd3)); + + src_ptr += src_2strides; + ref_ptr += ref_2strides; + } + + { + // extract the low lane and add it to the high lane + const __m128i sum_reg_128 = _mm_add_epi16( + _mm256_castsi256_si128(sum_reg), _mm256_extractf128_si256(sum_reg, 1)); + const __m128i sse_reg_128 = _mm_add_epi32( + _mm256_castsi256_si128(sse_reg), _mm256_extractf128_si256(sse_reg, 1)); + + // sum upper and lower 64 bits together and convert up to 32 bit values + const __m128i sum_reg_64 = + _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8)); + const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64); + + // unpack sse and sum registers and add + const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, sum_int32); + const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, sum_int32); + const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); + + // perform the final summation and extract the results + const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); + *((int *)sse) = _mm_cvtsi128_si32(res); + *((int *)sum) = _mm_extract_epi32(res, 1); + } +} + +#define FILTER_SRC(filter) \ + /* filter the source */ \ + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ + \ + /* add 8 to source */ \ + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ + \ + /* divide source by 16 */ \ + exp_src_lo = 
_mm256_srai_epi16(exp_src_lo, 4); \ + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + +#define CALC_SUM_SSE_INSIDE_LOOP \ + /* expand each byte to 2 bytes */ \ + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ + /* source - dest */ \ + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ + /* caculate sum */ \ + *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_lo); \ + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ + *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_hi); \ + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ + /* calculate sse */ \ + *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_lo); \ + *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_hi); + +// final calculation to sum and sse +#define CALC_SUM_AND_SSE \ + res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ + sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ + sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ + \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ + \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); + +static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg) { + const __m256i zero_reg = _mm256_setzero_si256(); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_reg = _mm256_loadu_si256((__m256i const *)src); + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i avg_reg = _mm256_avg_epu8(src_reg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + sec += sec_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } +} + +// (x == 0, y == 4) or (x == 4, y == 0). sstep determines the direction. 
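+// Averaging each pixel with the one sstep bytes ahead (the next column when
+// sstep == 1, the next row when sstep == src_stride) via _mm256_avg_epu8
+// implements the half-sample (offset 4) case without the bilinear filter
+// table; when do_sec is set the result is averaged once more with the
+// second predictor.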
+static INLINE void spv32_half_zero(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int sstep) { + const __m256i zero_reg = _mm256_setzero_si256(); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep)); + const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i avg_reg = _mm256_avg_epu8(src_avg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + sec += sec_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } +} + +static INLINE void spv32_x0_y4(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg) { + spv32_half_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, sum_reg, sse_reg, src_stride); +} + +static INLINE void spv32_x4_y0(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg) { + spv32_half_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, sum_reg, sse_reg, 1); +} + +static INLINE void spv32_x4_y4(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + src += src_stride; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)(src)); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); + const __m256i current_avg = _mm256_avg_epu8(prev_src_avg, src_avg); + prev_src_avg = src_avg; + + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i avg_reg = _mm256_avg_epu8(current_avg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + sec += sec_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(current_avg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(current_avg, zero_reg); + } + // save current source average + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + src += src_stride; + } +} + +// (x == 0, y == bil) or (x == 4, y == bil). sstep determines the direction. 
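+// For the remaining offsets each output pixel is a weighted sum of two
+// neighbouring pixels: the pair is interleaved with unpacklo/hi_epi8,
+// multiplied by the per-offset weights from bilinear_filters_avx2 with
+// _mm256_maddubs_epi16, and rounded as (a * w0 + b * w1 + 8) >> 4 by the
+// FILTER_SRC macro.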
+static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int offset, int sstep) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (offset << 5))); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep)); + exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); + exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); + + FILTER_SRC(filter) + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); + sec += sec_stride; + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } +} + +static INLINE void spv32_x0_yb(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int y_offset) { + spv32_bilin_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, sum_reg, sse_reg, y_offset, src_stride); +} + +static INLINE void spv32_xb_y0(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int x_offset) { + spv32_bilin_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, sum_reg, sse_reg, x_offset, 1); +} + +static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int y_offset) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5))); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + src += src_stride; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); + exp_src_lo = _mm256_unpacklo_epi8(prev_src_avg, src_avg); + exp_src_hi = _mm256_unpackhi_epi8(prev_src_avg, src_avg); + prev_src_avg = src_avg; + + FILTER_SRC(filter) + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i exp_src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + const __m256i avg_reg = _mm256_avg_epu8(exp_src_avg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + sec += sec_stride; + } + 
CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + src += src_stride; + } +} + +static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int x_offset) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5))); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i src_reg, src_pack; + int i; + exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b); + exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b); + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + + src += src_stride; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); + exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); + + FILTER_SRC(filter) + + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i avg_pack = _mm256_avg_epu8(src_pack, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_pack, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_pack, zero_reg); + sec += sec_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src_pack = src_reg; + dst += dst_stride; + src += src_stride; + } +} + +static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int x_offset, int y_offset) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5))); + const __m256i yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5))); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i prev_src_pack, src_pack; + int i; + exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b); + exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b); + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + prev_src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src += src_stride; + + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); + exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); + + FILTER_SRC(xfilter) + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + + // merge previous 
pack to current pack source + exp_src_lo = _mm256_unpacklo_epi8(prev_src_pack, src_pack); + exp_src_hi = _mm256_unpackhi_epi8(prev_src_pack, src_pack); + + FILTER_SRC(yfilter) + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + sec += sec_stride; + } + + prev_src_pack = src_pack; + + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + src += src_stride; + } +} + +static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, + int do_sec, int height, unsigned int *sse) { + const __m256i zero_reg = _mm256_setzero_si256(); + __m256i sum_reg = _mm256_setzero_si256(); + __m256i sse_reg = _mm256_setzero_si256(); + __m256i sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + int sum; + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + spv32_x0_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg); + // x_offset = 0 and y_offset = 4 + } else if (y_offset == 4) { + spv32_x0_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg); + // x_offset = 0 and y_offset = bilin interpolation + } else { + spv32_x0_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, y_offset); + } + // x_offset = 4 and y_offset = 0 + } else if (x_offset == 4) { + if (y_offset == 0) { + spv32_x4_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg); + // x_offset = 4 and y_offset = 4 + } else if (y_offset == 4) { + spv32_x4_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg); + // x_offset = 4 and y_offset = bilin interpolation + } else { + spv32_x4_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, y_offset); + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + spv32_xb_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, x_offset); + // x_offset = bilin interpolation and y_offset = 4 + } else if (y_offset == 4) { + spv32_xb_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, x_offset); + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + spv32_xb_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, x_offset, y_offset); + } + } + CALC_SUM_AND_SSE + return sum; +} + +static unsigned int sub_pixel_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, int height, unsigned int *sse) { + return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, + NULL, 0, 0, height, sse); +} + +static unsigned int sub_pixel_avg_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, + int height, unsigned int *sse) { + return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, + sec, sec_stride, 1, height, sse); +} + typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned 
int *sse, int *sum); -void vpx_get32x32var_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, unsigned int *sse, - int *sum); - static void variance_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int w, int h, unsigned int *sse, int *sum, get_var_avx2 var_fn, @@ -44,7 +620,7 @@ unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride, int sum; variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, vpx_get16x16var_avx2, 16); - return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); } unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride, @@ -60,7 +636,7 @@ unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, - vpx_get32x32var_avx2, 32); + get32x16var_avx2, 32); return *sse - (uint32_t)(((int64_t)sum * sum) >> 9); } @@ -69,7 +645,7 @@ unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, - vpx_get32x32var_avx2, 32); + get32x16var_avx2, 32); return *sse - (uint32_t)(((int64_t)sum * sum) >> 10); } @@ -78,7 +654,7 @@ unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, - vpx_get32x32var_avx2, 32); + get32x16var_avx2, 32); return *sse - (uint32_t)(((int64_t)sum * sum) >> 12); } @@ -87,32 +663,22 @@ unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, - vpx_get32x32var_avx2, 32); + get32x16var_avx2, 32); return *sse - (uint32_t)(((int64_t)sum * sum) >> 11); } -unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, - int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, - int height, unsigned int *sse); - -unsigned int vpx_sub_pixel_avg_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, - int height, unsigned int *sseptr); - unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, unsigned int *sse) { unsigned int sse1; - const int se1 = vpx_sub_pixel_variance32xh_avx2( + const int se1 = sub_pixel_variance32xh_avx2( src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1); unsigned int sse2; const int se2 = - vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, - dst + 32, dst_stride, 64, &sse2); + sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, + dst + 32, dst_stride, 64, &sse2); const int se = se1 + se2; *sse = sse1 + sse2; return *sse - (uint32_t)(((int64_t)se * se) >> 12); @@ -123,7 +689,7 @@ unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src, int y_offset, const uint8_t *dst, int dst_stride, unsigned int *sse) { - const int se = vpx_sub_pixel_variance32xh_avx2( + const int se = sub_pixel_variance32xh_avx2( src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse); return *sse - (uint32_t)(((int64_t)se * se) >> 10); } @@ -132,10 +698,10 @@ unsigned int vpx_sub_pixel_avg_variance64x64_avx2( const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, 
unsigned int *sse, const uint8_t *sec) { unsigned int sse1; - const int se1 = vpx_sub_pixel_avg_variance32xh_avx2( + const int se1 = sub_pixel_avg_variance32xh_avx2( src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1); unsigned int sse2; - const int se2 = vpx_sub_pixel_avg_variance32xh_avx2( + const int se2 = sub_pixel_avg_variance32xh_avx2( src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32, 64, 64, &sse2); const int se = se1 + se2; @@ -149,7 +715,7 @@ unsigned int vpx_sub_pixel_avg_variance32x32_avx2( const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { // Process 32 elements in parallel. - const int se = vpx_sub_pixel_avg_variance32xh_avx2( + const int se = sub_pixel_avg_variance32xh_avx2( src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse); return *sse - (uint32_t)(((int64_t)se * se) >> 10); } diff --git a/libs/libvpx/vpx_dsp/x86/variance_impl_avx2.c b/libs/libvpx/vpx_dsp/x86/variance_impl_avx2.c deleted file mode 100644 index 51e6b19ad1..0000000000 --- a/libs/libvpx/vpx_dsp/x86/variance_impl_avx2.c +++ /dev/null @@ -1,708 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include // AVX2 - -#include "./vpx_dsp_rtcd.h" -#include "vpx_ports/mem.h" - -/* clang-format off */ -DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, - 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, - 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, - 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, - 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, - 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, - 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, - 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, - 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, - 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, - 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, -}; -/* clang-format on */ - -void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *SSE, int *Sum) { - __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; - __m256i ref_expand_high, madd_low, madd_high; - unsigned int i, src_2strides, ref_2strides; - __m256i zero_reg = _mm256_set1_epi16(0); - __m256i sum_ref_src = _mm256_set1_epi16(0); - __m256i madd_ref_src = _mm256_set1_epi16(0); - - // processing two strides in a 256 bit register reducing the number - // of loop stride by half (comparing to the sse2 code) - src_2strides = source_stride << 1; - ref_2strides = recon_stride << 1; - for (i = 0; i < 8; i++) { - src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr))); - src = _mm256_inserti128_si256( - src, 
_mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1); - - ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr))); - ref = _mm256_inserti128_si256( - ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1); - - // expanding to 16 bit each lane - src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); - src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); - - ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); - ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); - - // src-ref - src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); - src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); - - // madd low (src - ref) - madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); - - // add high to low - src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); - - // madd high (src - ref) - madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); - - sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); - - // add high to low - madd_ref_src = - _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); - - src_ptr += src_2strides; - ref_ptr += ref_2strides; - } - - { - __m128i sum_res, madd_res; - __m128i expand_sum_low, expand_sum_high, expand_sum; - __m128i expand_madd_low, expand_madd_high, expand_madd; - __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; - - // extract the low lane and add it to the high lane - sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src), - _mm256_extractf128_si256(sum_ref_src, 1)); - - madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src), - _mm256_extractf128_si256(madd_ref_src, 1)); - - // padding each 2 bytes with another 2 zeroed bytes - expand_sum_low = - _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res); - expand_sum_high = - _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res); - - // shifting the sign 16 bits right - expand_sum_low = _mm_srai_epi32(expand_sum_low, 16); - expand_sum_high = _mm_srai_epi32(expand_sum_high, 16); - - expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high); - - // expand each 32 bits of the madd result to 64 bits - expand_madd_low = - _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); - expand_madd_high = - _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); - - expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high); - - ex_expand_sum_low = - _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); - ex_expand_sum_high = - _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); - - ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high); - - // shift 8 bytes eight - madd_res = _mm_srli_si128(expand_madd, 8); - sum_res = _mm_srli_si128(ex_expand_sum, 8); - - madd_res = _mm_add_epi32(madd_res, expand_madd); - sum_res = _mm_add_epi32(sum_res, ex_expand_sum); - - *((int *)SSE) = _mm_cvtsi128_si32(madd_res); - - *((int *)Sum) = _mm_cvtsi128_si32(sum_res); - } -} - -void vpx_get32x32var_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *SSE, int *Sum) { - __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; - __m256i ref_expand_high, madd_low, madd_high; - unsigned int i; - __m256i zero_reg = _mm256_set1_epi16(0); - __m256i sum_ref_src = _mm256_set1_epi16(0); - __m256i madd_ref_src = _mm256_set1_epi16(0); - - // processing 32 elements in parallel - for (i = 0; i < 16; i++) { - src = _mm256_loadu_si256((__m256i const 
*)(src_ptr)); - - ref = _mm256_loadu_si256((__m256i const *)(ref_ptr)); - - // expanding to 16 bit each lane - src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); - src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); - - ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); - ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); - - // src-ref - src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); - src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); - - // madd low (src - ref) - madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); - - // add high to low - src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); - - // madd high (src - ref) - madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); - - sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); - - // add high to low - madd_ref_src = - _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); - - src_ptr += source_stride; - ref_ptr += recon_stride; - } - - { - __m256i expand_sum_low, expand_sum_high, expand_sum; - __m256i expand_madd_low, expand_madd_high, expand_madd; - __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; - - // padding each 2 bytes with another 2 zeroed bytes - expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src); - expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src); - - // shifting the sign 16 bits right - expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16); - expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16); - - expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high); - - // expand each 32 bits of the madd result to 64 bits - expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg); - expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg); - - expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high); - - ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg); - ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg); - - ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high); - - // shift 8 bytes eight - madd_ref_src = _mm256_srli_si256(expand_madd, 8); - sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8); - - madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd); - sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); - - // extract the low lane and the high lane and add the results - *((int *)SSE) = - _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + - _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); - - *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + - _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); - } -} - -#define FILTER_SRC(filter) \ - /* filter the source */ \ - exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ - exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ - \ - /* add 8 to source */ \ - exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ - exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ - \ - /* divide source by 16 */ \ - exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ - exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); - -#define MERGE_WITH_SRC(src_reg, reg) \ - exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ - exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); - -#define LOAD_SRC_DST \ - /* load source and destination */ \ - src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ - dst_reg = _mm256_loadu_si256((__m256i const *)(dst)); - -#define AVG_NEXT_SRC(src_reg, size_stride) \ - src_next_reg = 
_mm256_loadu_si256((__m256i const *)(src + size_stride)); \ - /* average between current and next stride source */ \ - src_reg = _mm256_avg_epu8(src_reg, src_next_reg); - -#define MERGE_NEXT_SRC(src_reg, size_stride) \ - src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ - MERGE_WITH_SRC(src_reg, src_next_reg) - -#define CALC_SUM_SSE_INSIDE_LOOP \ - /* expand each byte to 2 bytes */ \ - exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ - exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ - /* source - dest */ \ - exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ - exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ - /* caculate sum */ \ - sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \ - exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ - sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ - exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ - /* calculate sse */ \ - sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ - sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); - -// final calculation to sum and sse -#define CALC_SUM_AND_SSE \ - res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ - sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ - sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ - sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ - sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ - sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ - \ - sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ - sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ - \ - sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ - sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ - *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ - _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ - sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ - sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ - sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ - _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); - -unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, - int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, - int height, unsigned int *sse) { - __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; - __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; - __m256i zero_reg; - int i, sum; - sum_reg = _mm256_set1_epi16(0); - sse_reg = _mm256_set1_epi16(0); - zero_reg = _mm256_set1_epi16(0); - - // x_offset = 0 and y_offset = 0 - if (x_offset == 0) { - if (y_offset == 0) { - for (i = 0; i < height; i++) { - LOAD_SRC_DST - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 0 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, src_stride) - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 0 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg; - - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, src_stride) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - } - // x_offset = 8 and y_offset = 0 - } else if (x_offset == 8) { 
- if (y_offset == 0) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 8 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg, src_avg; - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // average between previous average to current average - src_avg = _mm256_avg_epu8(src_avg, src_reg); - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - // save current source average - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - // x_offset = 8 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg, src_avg; - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - // save current source average - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - MERGE_WITH_SRC(src_avg, src_reg) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - // x_offset = bilin interpolation and y_offset = 0 - } else { - if (y_offset == 0) { - __m256i filter, pw8, src_next_reg; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = 8 - } else if (y_offset == 8) { - __m256i filter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // average between previous pack to the current - src_pack = _mm256_avg_epu8(src_pack, src_reg); - MERGE_WITH_SRC(src_pack, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src_pack = src_reg; - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = bilin interpolation - } else { - __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - xfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - y_offset <<= 5; - yfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - - FILTER_SRC(xfilter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = 
_mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(xfilter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // merge previous pack to current pack source - MERGE_WITH_SRC(src_pack, src_reg) - // filter the source - FILTER_SRC(yfilter) - src_pack = src_reg; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - } - CALC_SUM_AND_SSE - return sum; -} - -unsigned int vpx_sub_pixel_avg_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, - int height, unsigned int *sse) { - __m256i sec_reg; - __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; - __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; - __m256i zero_reg; - int i, sum; - sum_reg = _mm256_set1_epi16(0); - sse_reg = _mm256_set1_epi16(0); - zero_reg = _mm256_set1_epi16(0); - - // x_offset = 0 and y_offset = 0 - if (x_offset == 0) { - if (y_offset == 0) { - for (i = 0; i < height; i++) { - LOAD_SRC_DST - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - } else if (y_offset == 8) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, src_stride) - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 0 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg; - - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, src_stride) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - } - // x_offset = 8 and y_offset = 0 - } else if (x_offset == 8) { - if (y_offset == 0) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 8 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg, src_avg; - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - // save current source average - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // average between previous average to current average - src_avg = _mm256_avg_epu8(src_avg, src_reg); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_avg = _mm256_avg_epu8(src_avg, sec_reg); - sec += sec_stride; - // expand each byte 
to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - // x_offset = 8 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg, src_avg; - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - // save current source average - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - MERGE_WITH_SRC(src_avg, src_reg) - FILTER_SRC(filter) - src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_avg = _mm256_avg_epu8(src_avg, sec_reg); - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - sec += sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - // x_offset = bilin interpolation and y_offset = 0 - } else { - if (y_offset == 0) { - __m256i filter, pw8, src_next_reg; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - MERGE_WITH_SRC(src_reg, zero_reg) - sec += sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = 8 - } else if (y_offset == 8) { - __m256i filter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // average between previous pack to the current - src_pack = _mm256_avg_epu8(src_pack, src_reg); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_pack = _mm256_avg_epu8(src_pack, sec_reg); - sec += sec_stride; - MERGE_WITH_SRC(src_pack, zero_reg) - src_pack = src_reg; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = bilin interpolation - } else { - __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - xfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - y_offset <<= 5; - yfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - - FILTER_SRC(xfilter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(xfilter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // merge 
previous pack to current pack source - MERGE_WITH_SRC(src_pack, src_reg) - // filter the source - FILTER_SRC(yfilter) - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_pack = _mm256_avg_epu8(src_pack, sec_reg); - MERGE_WITH_SRC(src_pack, zero_reg) - src_pack = src_reg; - sec += sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - } - CALC_SUM_AND_SSE - return sum; -} diff --git a/libs/libvpx/vpx_dsp/x86/variance_sse2.c b/libs/libvpx/vpx_dsp/x86/variance_sse2.c index 1161da4914..8d8bf183b2 100644 --- a/libs/libvpx/vpx_dsp/x86/variance_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/variance_sse2.c @@ -222,7 +222,7 @@ unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride, unsigned int *sse) { int sum; vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); } unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride, diff --git a/libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c b/libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c index 727d9d1156..4f164afeb4 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c +++ b/libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c @@ -41,38 +41,38 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; // void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); // void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2); // void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t 
src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); FUN_CONV_2D(, sse2); FUN_CONV_2D(avg_, sse2); @@ -140,22 +140,22 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; // const int16_t *filter_y, // int y_step_q4, // int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2); // void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h, int bd); // void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h, int bd); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h, int bd); HIGH_FUN_CONV_2D(, sse2); HIGH_FUN_CONV_2D(avg_, sse2); #endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 diff --git a/libs/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm index e2311c1167..3f444e2e6a 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm @@ -20,21 +20,19 @@ SECTION .text %endif %ifidn %2, highbd %define pavg pavgw -cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ +cglobal %2_convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \ dst, dst_stride, \ - fx, fxs, fy, fys, w, h, bd + f, fxo, fxs, fyo, fys, w, h, bd %else %define pavg pavgb -cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ +cglobal convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \ dst, dst_stride, \ - fx, fxs, fy, fys, w, h + f, fxo, fxs, fyo, fys, w, h %endif mov r4d, dword wm %ifidn %2, highbd shl r4d, 1 - shl srcq, 1 shl src_strideq, 1 - shl dstq, 1 shl dst_strideq, 1 %else cmp r4d, 4 diff --git a/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm index bfc816f235..d83507dc99 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm @@ -197,6 +197,8 @@ movdqu [rdi + %2], xmm0 %endm +SECTION .text + ;void vpx_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, diff --git a/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm index 72f2ff71da..9bffe504b1 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm @@ -171,6 +171,8 @@ %endm %endif +SECTION .text + global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE 
sym(vpx_highbd_filter_block1d4_v2_sse2): push rbp diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 7c1ecc0148..d0919695ce 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -12,9 +12,10 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_avx2.h" #include "vpx_ports/mem.h" -// filters for 16_h8 and 16_v8 +// filters for 16_h8 DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 @@ -35,493 +36,296 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; -#if defined(__clang__) -#if (__clang_major__ > 0 && __clang_major__ < 3) || \ - (__clang_major__ == 3 && __clang_minor__ <= 3) || \ - (defined(__APPLE__) && defined(__apple_build_version__) && \ - ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ - (__clang_major__ == 5 && __clang_minor__ == 0))) -#define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -#else // clang > 3.3, and not 5.0 on macosx. -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // clang <= 3.3 -#elif defined(__GNUC__) -#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) -#define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 -#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) -#else // gcc > 4.7 -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // gcc <= 4.6 -#else // !(gcc || clang) -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // __clang__ - -static void vpx_filter_block1d16_h8_avx2( +static INLINE void vpx_filter_block1d16_h8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, - ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; - __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; - __m256i srcReg32b1, srcReg32b2, filtersReg32; + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter, + const int avg) { + __m128i outReg1, outReg2; + __m256i outReg32b1, outReg32b2; unsigned int i; ptrdiff_t src_stride, dst_stride; + __m256i f[4], filt[4], s[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. 
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); - - filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); - filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); - filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); - filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + shuffle_filter_avx2(filter, f); + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { + __m256i srcReg; + // load the 2 strides of source - srcReg32b1 = + srcReg = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg32b1 = _mm256_inserti128_si256( - srcReg32b1, + srcReg = _mm256_inserti128_si256( + srcReg, _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), 1); // filter the source buffer - srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); - - // filter the source buffer - srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16( - srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); + outReg32b1 = convolve8_16_avx2(s, f); // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) - srcReg32b2 = + srcReg = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg32b2 = _mm256_inserti128_si256( - srcReg32b2, + srcReg = _mm256_inserti128_si256( + srcReg, _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), 1); - // add 
and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16( - srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - // filter the source buffer - srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); + outReg32b2 = convolve8_16_avx2(s, f); - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); - - // filter the source buffer - srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); - srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); - - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); - srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + outReg32b1 = _mm256_packus_epi16(outReg32b1, outReg32b2); src_ptr += src_stride; + // average if necessary + outReg1 = _mm256_castsi256_si128(outReg32b1); + outReg2 = _mm256_extractf128_si256(outReg32b1, 1); + if (avg) { + outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); + outReg2 = _mm_avg_epu8( + outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch))); + } + // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, - _mm256_castsi256_si128(srcRegFilt32b1_1)); + _mm_store_si128((__m128i *)output_ptr, outReg1); // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + output_pitch), - _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2); + output_ptr += dst_stride; } // if the number of strides is odd. 
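outReg32b1 and outReg32b2 now come from convolve8_16_avx2(), which factors out the multiply, sum and round sequence that used to be written out inline here. A sketch reconstructed from the deleted code (the shared helper in convolve_avx2.h may order the additions differently): each s[i] holds pixel pairs matched to the duplicated tap pair f[i], the min/max pairing reproduces the deleted code's saturation-safe addition order, and 64 is 1 << (FILTER_BITS - 1), the rounding offset for the final shift by 7.

static INLINE __m256i convolve8_16_avx2_sketch(const __m256i *const s,
                                               const __m256i *const f) {
  const __m256i k_64 = _mm256_set1_epi16(1 << 6);
  // multiply 2 adjacent pixels with their pair of taps and add the products
  const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]);
  const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]);
  const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]);
  const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]);
  // sum the outer products first, then the smaller and larger inner product
  __m256i sum = _mm256_adds_epi16(x0, x3);
  sum = _mm256_adds_epi16(sum, _mm256_min_epi16(x1, x2));
  sum = _mm256_adds_epi16(sum, _mm256_max_epi16(x1, x2));
  // round and shift right by FILTER_BITS (7)
  sum = _mm256_adds_epi16(sum, k_64);
  return _mm256_srai_epi16(sum, 7);
}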
// process only 16 bytes if (i > 0) { - __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; - __m128i srcRegFilt2, srcRegFilt3; + __m128i srcReg; - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + // load the first 16 bytes of the last row + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer - srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1_1 = - _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = - _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + s[0] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); + s[1] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); + s[2] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); + s[3] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); + outReg1 = convolve8_8_avx2(s, f); // reading the next 16 bytes // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); - - // add and saturate the results together - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); // filter the source buffer - srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); + s[0] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); + s[1] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); + s[2] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); + s[3] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); + outReg2 = convolve8_8_avx2(s, f); - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt2_1 = - _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + outReg1 = _mm_packus_epi16(outReg1, outReg2); - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter 
and add the result - srcRegFilt3 = - _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64)); - - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); - srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + // average if necessary + if (avg) { + outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); + } // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1); + _mm_store_si128((__m128i *)output_ptr, outReg1); } } -static void vpx_filter_block1d16_v8_avx2( +static void vpx_filter_block1d16_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr, + ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) { + vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride, + output_height, filter, 0); +} + +static void vpx_filter_block1d16_h8_avg_avx2( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr, + ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) { + vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride, + output_height, filter, 1); +} + +static INLINE void vpx_filter_block1d16_v8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, - ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg64; - __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; - __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; - __m256i srcReg32b11, srcReg32b12, filtersReg32; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter, + const int avg) { + __m128i outReg1, outReg2; + __m256i srcRegHead1; unsigned int i; ptrdiff_t src_stride, dst_stride; + __m256i f[4], s1[4], s2[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the - // same data in both lanes of 128 bit register. 
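The one behavioural addition in these rewritten kernels is the avg flag: when it is set, the packed 8-bit result is blended with the pixels already sitting in the destination via _mm_avg_epu8, whose per-byte (a + b + 1) >> 1 matches the rounding average the C convolve_avg reference applies for compound prediction. The thin vpx_filter_block1d16_h8_avx2 / _avg_avx2 wrappers pass a constant 0 or 1, so the branch folds away inside the INLINE body. A minimal sketch of the store-or-average step under those assumptions:

static INLINE void store_or_avg_16(const __m128i res, uint8_t *const dst,
                                   const int avg) {
  if (avg) {
    // rounding average with the prediction already present in dst
    const __m128i d = _mm_load_si128((const __m128i *)dst);
    _mm_store_si128((__m128i *)dst, _mm_avg_epu8(res, d));
  } else {
    _mm_store_si128((__m128i *)dst, res);
  }
}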
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + shuffle_filter_avx2(filter, f); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; - // load 16 bytes 7 times in stride of src_pitch - srcReg32b1 = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr))); - srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); - srcReg32b3 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); - srcReg32b4 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); - srcReg32b5 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); - srcReg32b6 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); - srcReg32b7 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + { + __m128i s[6]; + __m256i s32b[6]; - // have each consecutive loads on the same 256 register - srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm256_castsi256_si128(srcReg32b2), 1); - srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm256_castsi256_si128(srcReg32b3), 1); - srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, - _mm256_castsi256_si128(srcReg32b4), 1); - srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, - _mm256_castsi256_si128(srcReg32b5), 1); - srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, - _mm256_castsi256_si128(srcReg32b6), 1); - srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, - _mm256_castsi256_si128(srcReg32b7), 1); + // load 16 bytes 7 times in stride of src_pitch + s[0] = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_pitch)); + srcRegHead1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_pitch))); - // merge every two consecutive registers except the last one - srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); - srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); + // have each consecutive loads on the same 256 register + s32b[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1); + s32b[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1); + s32b[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1); + s32b[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1); + s32b[4] = 
_mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1); + s32b[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), + _mm256_castsi256_si128(srcRegHead1), 1); - // save - srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); - - // save - srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); - - // save - srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); - - // save - srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + // merge every two consecutive registers except the last one + // the first lanes contain values for filtering odd rows (1,3,5...) and + // the second lanes contain values for filtering even rows (2,4,6...) + s1[0] = _mm256_unpacklo_epi8(s32b[0], s32b[1]); + s2[0] = _mm256_unpackhi_epi8(s32b[0], s32b[1]); + s1[1] = _mm256_unpacklo_epi8(s32b[2], s32b[3]); + s2[1] = _mm256_unpackhi_epi8(s32b[2], s32b[3]); + s1[2] = _mm256_unpacklo_epi8(s32b[4], s32b[5]); + s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]); + } for (i = output_height; i > 1; i -= 2) { - // load the last 2 loads of 16 bytes and have every two + __m256i srcRegHead2, srcRegHead3; + + // load the next 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register - srcReg32b8 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); - srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, - _mm256_castsi256_si128(srcReg32b8), 1); - srcReg32b9 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); - srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, - _mm256_castsi256_si128(srcReg32b9), 1); + srcRegHead2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 7 * src_pitch))); + srcRegHead1 = _mm256_inserti128_si256( + srcRegHead1, _mm256_castsi256_si128(srcRegHead2), 1); + srcRegHead3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 8 * src_pitch))); + srcRegHead2 = _mm256_inserti128_si256( + srcRegHead2, _mm256_castsi256_si128(srcRegHead3), 1); - // merge every two consecutive registers - // save - srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); - srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); + // merge the two new consecutive registers + // the first lane contain values for filtering odd rows (1,3,5...) and + // the second lane contain values for filtering even rows (2,4,6...) 
+ s1[3] = _mm256_unpacklo_epi8(srcRegHead1, srcRegHead2); + s2[3] = _mm256_unpackhi_epi8(srcRegHead1, srcRegHead2); - // multiply 2 adjacent elements with the filter and add the result - srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + s1[0] = convolve8_16_avx2(s1, f); + s2[0] = convolve8_16_avx2(s2, f); - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); - - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); - - // add and saturate the results together - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); - srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + s1[0] = _mm256_packus_epi16(s1[0], s2[0]); src_ptr += src_stride; + // average if necessary + outReg1 = _mm256_castsi256_si128(s1[0]); + outReg2 = _mm256_extractf128_si256(s1[0], 1); + if (avg) { + outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); + outReg2 = _mm_avg_epu8( + outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch))); + } + // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1)); + _mm_store_si128((__m128i *)output_ptr, outReg1); // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + out_pitch), - _mm256_extractf128_si256(srcReg32b1, 1)); + _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2); output_ptr += dst_stride; - // save part of the registers for next strides - srcReg32b10 = srcReg32b11; - srcReg32b1 = srcReg32b3; - srcReg32b11 = srcReg32b2; - srcReg32b3 = srcReg32b5; - srcReg32b2 = srcReg32b4; - srcReg32b5 = srcReg32b7; - srcReg32b7 = srcReg32b9; + // shift down by two rows + s1[0] = s1[1]; + s2[0] = s2[1]; + s1[1] = s1[2]; + s2[1] = s2[2]; + s1[2] = s1[3]; + s2[2] = s2[3]; + srcRegHead1 = srcRegHead3; } + + // if the number of strides is odd. 
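In the vertical kernel, every pair of vertically adjacent rows is interleaved bytewise (unpacklo/unpackhi) so that _mm256_maddubs_epi16 can apply one duplicated tap pair to both rows at once; because each 256-bit source register also holds row r in its low lane and row r + 1 in its high lane, a single convolve8_16_avx2() call yields two output rows, and the register rotation above only has to refresh two rows per iteration. A 128-bit illustration of the per-tap-pair step, with hypothetical names:

static INLINE __m128i vert_tap_pair_sketch(const __m128i row0,
                                           const __m128i row1,
                                           const __m128i tap_pair) {
  // interleaved = r0[0] r1[0] r0[1] r1[1] ... r0[7] r1[7]
  const __m128i interleaved = _mm_unpacklo_epi8(row0, row1);
  // each 16-bit lane becomes tap0 * r0[x] + tap1 * r1[x]
  return _mm_maddubs_epi16(interleaved, tap_pair);
}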
+ // process only 16 bytes if (i > 0) { - __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; - __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; // load the last 16 bytes - srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); + const __m128i srcRegHead2 = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the last 2 results together - srcRegFilt4 = - _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); - srcRegFilt7 = - _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + s1[0] = _mm256_castsi128_si256( + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2)); + s2[0] = _mm256_castsi128_si256( + _mm_unpackhi_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2)); - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt4 = - _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); - srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt7 = - _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); + outReg1 = convolve8_8_avx2(s1, f); + outReg2 = convolve8_8_avx2(s2, f); - // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + outReg1 = _mm_packus_epi16(outReg1, outReg2); - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), - _mm256_castsi256_si128(secondFilters)); - srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), - _mm256_castsi256_si128(secondFilters)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), - _mm256_castsi256_si128(thirdFilters)); - srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), - _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7)); - - // add and saturate the results together - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7)); - - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + // average if necessary + if (avg) { + outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); + } // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, srcRegFilt1); + _mm_store_si128((__m128i *)output_ptr, outReg1); } } +static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + 
ptrdiff_t dst_stride, uint32_t height, + const int16_t *filter) { + vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, filter, 0); +} + +static void vpx_filter_block1d16_v8_avg_avx2( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *filter) { + vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, filter, 1); +} + #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; #if ARCH_X86_64 @@ -539,6 +343,14 @@ filter8_1dfunction vpx_filter_block1d4_h8_ssse3; #define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3 #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 #endif // ARCH_X86_64 +filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; +#define vpx_filter_block1d8_v8_avg_avx2 vpx_filter_block1d8_v8_avg_ssse3 +#define vpx_filter_block1d8_h8_avg_avx2 vpx_filter_block1d8_h8_avg_ssse3 +#define vpx_filter_block1d4_v8_avg_avx2 vpx_filter_block1d4_v8_avg_ssse3 +#define vpx_filter_block1d4_h8_avg_avx2 vpx_filter_block1d4_h8_avg_ssse3 filter8_1dfunction vpx_filter_block1d16_v2_ssse3; filter8_1dfunction vpx_filter_block1d16_h2_ssse3; filter8_1dfunction vpx_filter_block1d8_v2_ssse3; @@ -552,23 +364,53 @@ filter8_1dfunction vpx_filter_block1d4_h2_ssse3; #define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3 #define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3 #define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3 +filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; +#define vpx_filter_block1d16_v2_avg_avx2 vpx_filter_block1d16_v2_avg_ssse3 +#define vpx_filter_block1d16_h2_avg_avx2 vpx_filter_block1d16_h2_avg_ssse3 +#define vpx_filter_block1d8_v2_avg_avx2 vpx_filter_block1d8_v2_avg_ssse3 +#define vpx_filter_block1d8_h2_avg_avx2 vpx_filter_block1d8_h2_avg_ssse3 +#define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3 +#define vpx_filter_block1d4_h2_avg_avx2 vpx_filter_block1d4_h2_avg_ssse3 // void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); +// void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +// void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, 
ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2); // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); +// void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); FUN_CONV_2D(, avx2); +FUN_CONV_2D(avg_, avx2); #endif // HAVE_AX2 && HAVE_SSSE3 diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index b26d97b455..e4f992780f 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -8,52 +8,37 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include +#include // SSSE3 + +#include #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_filter.h" #include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_ssse3.h" +#include "vpx_dsp/x86/mem_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" -#include "vpx_ports/emmintrin_compat.h" - -// filters only for the 4_h8 convolution -DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -// filters for 8_h8 and 16_h8 -DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 -}; // These are reused by the avx2 intrinsics. 
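With the extra FUN_CONV_1D/FUN_CONV_2D instantiations above, the AVX2 build now exposes vpx_convolve8_avg_horiz_avx2, vpx_convolve8_avg_vert_avx2 and vpx_convolve8_avg_avx2 alongside the existing non-averaging entry points, so the compound-prediction blend no longer has to fall back to the SSSE3 versions. A hypothetical caller, following the argument order given in the prototype comments (a step of 16 in q4 units means no scaling); the function and parameter names here are illustrative only:

static void blend_second_reference(const uint8_t *ref, ptrdiff_t ref_stride,
                                   uint8_t *pred, ptrdiff_t pred_stride,
                                   const InterpKernel *kernel, int subpel_x,
                                   int subpel_y, int w, int h) {
  // pred already holds the first prediction; the avg variant averages the
  // newly filtered reference block into it rather than overwriting it.
  vpx_convolve8_avg(ref, ref_stride, pred, pred_stride, kernel, subpel_x, 16,
                    subpel_y, 16, w, h);
}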
-filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; +// vpx_filter_block1d8_v8_intrin_ssse3() +// vpx_filter_block1d8_h8_intrin_ssse3() +// vpx_filter_block1d4_h8_intrin_ssse3() + +static INLINE __m128i shuffle_filter_convolve8_8_ssse3( + const __m128i *const s, const int16_t *const filter) { + __m128i f[4]; + shuffle_filter_ssse3(filter, f); + return convolve8_8_ssse3(s, f); +} void vpx_filter_block1d4_h8_intrin_ssse3( - const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i firstFilters, secondFilters, shuffle1, shuffle2; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, srcReg, minReg; + __m128i srcRegFilt1, srcRegFilt2; + __m128i addFilterReg64, filtersReg, srcReg; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 @@ -75,8 +60,8 @@ void vpx_filter_block1d4_h8_intrin_ssse3( secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); // loading the local filters - shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8); - shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); + shuffle1 = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6); + shuffle2 = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10); for (i = 0; i < output_height; i++) { srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); @@ -89,25 +74,23 @@ void vpx_filter_block1d4_h8_intrin_ssse3( srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - // extract the higher half of the lane - srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); - srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); + // sum the results together, saturating only on the final step + // the specific order of the additions prevents outranges + srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2); - minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); + // extract the higher half of the register + srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8); - // add and saturate all the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + // add the rounding offset early to avoid another saturated add + srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); // shift by 7 bit each 16 bits srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); // shrink to 8 bit each 16 bits srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - src_ptr += src_pixels_per_line; + src_ptr += src_pitch; // save only 4 bytes *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1); @@ -117,77 +100,35 @@ void vpx_filter_block1d4_h8_intrin_ssse3( } void vpx_filter_block1d8_h8_intrin_ssse3( - const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { - __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; - __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m128i 
srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, minReg; unsigned int i; + __m128i f[4], filt[4], s[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 128 bit register - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 128 bit register - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 128 bit register - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 128 bit register - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - filt1Reg = _mm_load_si128((__m128i const *)filt1_global); - filt2Reg = _mm_load_si128((__m128i const *)filt2_global); - filt3Reg = _mm_load_si128((__m128i const *)filt3_global); - filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + shuffle_filter_ssse3(filter, f); + filt[0] = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + filt[1] = _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + filt[2] = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12); + filt[3] = + _mm_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14); for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer - srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg); - srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg); - srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); - - // add and saturate all the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - - srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bits - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + s[0] = _mm_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm_shuffle_epi8(srcReg, filt[3]); + s[0] = convolve8_8_ssse3(s, f); // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + s[0] = _mm_packus_epi16(s[0], s[0]); - src_ptr += src_pixels_per_line; + src_ptr += src_pitch; // save only 8 bytes - _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); + _mm_storel_epi64((__m128i 
*)&output_ptr[0], s[0]); output_ptr += output_pitch; } @@ -196,83 +137,49 @@ void vpx_filter_block1d8_h8_intrin_ssse3( void vpx_filter_block1d8_v8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { - __m128i addFilterReg64, filtersReg, minReg; - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; - __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; - __m128i srcReg8; unsigned int i; + __m128i f[4], s[8], ss[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits in the filter - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits in the filter - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits in the filter - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + shuffle_filter_ssse3(filter, f); // load the first 7 rows of 8 bytes - srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); - srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); for (i = 0; i < output_height; i++) { // load the last 8 bytes - srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); // merge the result together - srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); - srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); // merge the result together - srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); - srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); - srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); - - // add and saturate the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); - srcRegFilt2 = 
_mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + ss[0] = convolve8_8_ssse3(ss, f); // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + ss[0] = _mm_packus_epi16(ss[0], ss[0]); src_ptr += src_pitch; // shift down a row - srcReg1 = srcReg2; - srcReg2 = srcReg3; - srcReg3 = srcReg4; - srcReg4 = srcReg5; - srcReg5 = srcReg6; - srcReg6 = srcReg7; - srcReg7 = srcReg8; + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + s[3] = s[4]; + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; // save only 8 bytes convolve result - _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); + _mm_storel_epi64((__m128i *)&output_ptr[0], ss[0]); output_ptr += out_pitch; } @@ -306,149 +213,69 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; // void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); // void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - ssse3); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, ssse3); -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ - const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ - const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ - const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ - \ - const 
__m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ - const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ - const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ - const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ - \ - out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ - out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ - out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ - out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \ - out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ - out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ - out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ - out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ - } +static void filter_horiz_w8_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const x_filter) { + __m128i s[8], ss[4], temp; -static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *x_filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)x_filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_x); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7)); - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57 - const __m128i tr0_2 = _mm_unpacklo_epi16(E, F); - // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77 - const __m128i tr0_3 = _mm_unpacklo_epi16(G, H); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73 - const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3); - // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77 - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 - const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2); - const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2); - const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3); - const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = 
_mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); + load_8bit_8x8(src, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + transpose_16bit_4x8(s, ss); + temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter); // shrink to 8 bit each 16 bits temp = _mm_packus_epi16(temp, temp); // save only 8 bytes convolve result _mm_storel_epi64((__m128i *)dst, temp); } -static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A, B, C, D, E, F, G, H; +static void transpose8x8_to_dst(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride) { + __m128i s[8]; - A = _mm_loadl_epi64((const __m128i *)src); - B = _mm_loadl_epi64((const __m128i *)(src + src_stride)); - C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); - D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); - E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); - F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5)); - G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6)); - H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7)); - - TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H); - - _mm_storel_epi64((__m128i *)dst, A); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H); + load_8bit_8x8(src, src_stride, s); + transpose_8bit_8x8(s, s); + store_8bit_8x8(s, dst, dst_stride); } -static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { +static void scaledconvolve_horiz_w8(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, + const int w, const int h) { DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); int x, y, z; src -= SUBPEL_TAPS / 2 - 1; - // This function processes 8x8 areas. The intermediate height is not always + // This function processes 8x8 areas. The intermediate height is not always // a multiple of 8, so force it to be a multiple of 8 here. 
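The scaled-convolve row and column helpers in this file used to round with _mm_mulhrs_epi16(sum, 1 << 8); since pmulhrsw computes ((a * b) + (1 << 14)) >> 15 per 16-bit lane, multiplying by 256 is exactly (sum + 64) >> 7, the same ROUND_POWER_OF_TWO(sum, FILTER_BITS) that the deleted inline code elsewhere applies with an explicit add of 64 and an arithmetic shift, so switching to the shared shuffle_filter/convolve8_8_ssse3 helpers should not change the rounding (the helper's exact body is assumed here, it is not shown in this patch). The hand-rolled transposes are likewise replaced by the shared load_8bit_*/transpose_* helpers without changing the data layout. The rounding identity, stated in scalar form:

static INLINE int16_t round_filter_bits(int32_t sum) {
  // _mm_mulhrs_epi16(sum, 256) == ((sum * 256) + (1 << 14)) >> 15
  //                            == (sum + 64) >> 7
  // i.e. ROUND_POWER_OF_TWO(sum, FILTER_BITS) with FILTER_BITS == 7.
  return (int16_t)((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
}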
y = h + (8 - (h & 0x7)); @@ -479,93 +306,50 @@ static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, } while (y -= 8); } -static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - // TRANSPOSE... - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // - // TO - // - // 00 10 20 30 - // 01 11 21 31 - // 02 12 22 32 - // 03 13 23 33 - // 04 14 24 34 - // 05 15 25 35 - // 06 16 26 36 - // 07 17 27 37 - // - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); +static void filter_horiz_w4_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const filter) { + __m128i s[4], ss[2]; + __m128i temp; + + load_8bit_8x4(src, src_stride, s); + transpose_16bit_4x4(s, ss); + // 00 01 10 11 20 21 30 31 + s[0] = ss[0]; // 02 03 12 13 22 23 32 33 - const __m128i s3s2 = _mm_srli_si128(s1s0, 8); + s[1] = _mm_srli_si128(ss[0], 8); + // 04 05 14 15 24 25 34 35 + s[2] = ss[1]; // 06 07 16 17 26 27 36 37 - const __m128i s7s6 = _mm_srli_si128(s5s4, 8); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); + s[3] = _mm_srli_si128(ss[1], 8); + + temp = shuffle_filter_convolve8_8_ssse3(s, filter); // shrink to 8 bit each 16 bits temp = _mm_packus_epi16(temp, temp); // save only 4 bytes *(int *)dst = _mm_cvtsi128_si32(temp); } -static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A = _mm_cvtsi32_si128(*(const int *)src); - __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride)); - __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2)); - __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3)); - // 00 10 01 11 02 12 03 13 - const __m128i tr0_0 = 
_mm_unpacklo_epi8(A, B); - // 20 30 21 31 22 32 23 33 - const __m128i tr0_1 = _mm_unpacklo_epi8(C, D); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - A = _mm_unpacklo_epi16(tr0_0, tr0_1); - B = _mm_srli_si128(A, 4); - C = _mm_srli_si128(A, 8); - D = _mm_srli_si128(A, 12); +static void transpose4x4_to_dst(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride) { + __m128i s[4]; - *(int *)(dst) = _mm_cvtsi128_si32(A); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); - *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); - *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); + load_8bit_4x4(src, src_stride, s); + s[0] = transpose_8bit_4x4(s); + s[1] = _mm_srli_si128(s[0], 4); + s[2] = _mm_srli_si128(s[0], 8); + s[3] = _mm_srli_si128(s[0], 12); + store_8bit_4x4(s, dst, dst_stride); } -static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { +static void scaledconvolve_horiz_w4(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, + const int w, const int h) { DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); int x, y, z; src -= SUBPEL_TAPS / 2 - 1; @@ -597,50 +381,41 @@ static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, } } -static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr); - const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch)); - const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); +static __m128i filter_vert_kernel(const __m128i *const s, 
+ const int16_t *const filter) { + __m128i ss[4]; + __m128i temp; + + // 00 10 01 11 02 12 03 13 + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + // 20 30 21 31 22 32 23 33 + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + // 40 50 41 51 42 52 43 53 + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + // 60 70 61 71 62 72 63 73 + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + + temp = shuffle_filter_convolve8_8_ssse3(ss, filter); // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); + return _mm_packus_epi16(temp, temp); +} + +static void filter_vert_w4_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const int16_t *const filter) { + __m128i s[8]; + __m128i temp; + + load_8bit_4x8(src, src_stride, s); + temp = filter_vert_kernel(s, filter); // save only 4 bytes *(int *)dst = _mm_cvtsi128_si32(temp); } -static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static void scaledconvolve_vert_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; @@ -659,50 +434,21 @@ static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, } } -static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); +static void filter_vert_w8_ssse3(const uint8_t *const 
src, + const ptrdiff_t src_stride, uint8_t *const dst, + const int16_t *const filter) { + __m128i s[8], temp; + + load_8bit_8x8(src, src_stride, s); + temp = filter_vert_kernel(s, filter); // save only 8 bytes convolve result _mm_storel_epi64((__m128i *)dst, temp); } -static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static void scaledconvolve_vert_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; @@ -719,81 +465,44 @@ static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, } } -static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter, int w) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); +static void filter_vert_w16_ssse3(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const filter, const int w) { int i; + __m128i f[4]; + shuffle_filter_ssse3(filter, f); for (i = 0; i < w; i += 16) { - const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); - const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - // merge the result together - const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B); - const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H); - const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B); - const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0); - const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6); - const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0); - const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6); - // add and saturate the results together - const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo); - const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi); - // merge the result together - const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D); - const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2); - const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2); - // merge the result together - const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F); - const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x2_lo = 
_mm_maddubs_epi16(s5s4_lo, f5f4); - const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4); - // add and saturate the results together - __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo)); - __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi)); + __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi; - // add and saturate the results together - temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo)); - temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi)); - // round and shift by 7 bit each 16 bit - temp_lo = _mm_mulhrs_epi16(temp_lo, k_256); - temp_hi = _mm_mulhrs_epi16(temp_hi, k_256); - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result + loadu_8bit_16x8(src, src_stride, s); + + // merge the result together + s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]); + s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]); + s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]); + s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]); + s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]); + s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]); + s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]); + s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]); + temp_lo = convolve8_8_ssse3(s_lo, f); + temp_hi = convolve8_8_ssse3(s_hi, f); + + // shrink to 8 bit each 16 bits, the first lane contain the first convolve + // result and the second lane contain the second convolve result temp_hi = _mm_packus_epi16(temp_lo, temp_hi); - src_ptr += 16; + src += 16; // save 16 bytes convolve result _mm_store_si128((__m128i *)&dst[i], temp_hi); } } -static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static void scaledconvolve_vert_w16( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; @@ -811,11 +520,10 @@ static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, } } -static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h) { +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. @@ -829,70 +537,49 @@ static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. // --Require an additional 8 rows for the horiz_w8 transpose tail. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. 
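+  // Rough bound check of that claim (assuming y0_q4 < 16, i.e. a 1/16-pel + // phase offset): with y_step_q4 = 64 and h <= 32, as allowed by the relaxed + // assert below, intermediate_height is at most + // (((32 - 1) * 64 + 15) >> SUBPEL_BITS) + SUBPEL_TAPS = 124 + 8 = 132, which + // still fits in the 135 + 8 rows reserved for temp.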
DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= 64); assert(h <= 64); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); if (w >= 8) { scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, x_filters, x0_q4, x_step_q4, - w, intermediate_height); + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); } else { scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, x_filters, x0_q4, x_step_q4, - w, intermediate_height); + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); } if (w >= 16) { scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); + dst_stride, filter, y0_q4, y_step_q4, w, h); } else if (w == 8) { scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); + dst_stride, filter, y0_q4, y_step_q4, w, h); } else { scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); + dst_stride, filter, y0_q4, y_step_q4, w, h); } } -static const InterpKernel *get_filter_base(const int16_t *filter) { - // NOTE: This assumes that the filter table is 256-byte aligned. - // TODO(agrange) Modify to make independent of table alignment. - return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); -} - -static int get_filter_offset(const int16_t *f, const InterpKernel *base) { - return (int)((const InterpKernel *)(intptr_t)f - base); -} - -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, filters_y, y0_q4, y_step_q4, w, h); -} - // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); FUN_CONV_2D(, ssse3); FUN_CONV_2D(avg_, ssse3); diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm index 08f3d6a6cf..8497e1721b 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm @@ -176,6 +176,8 @@ movq [rdi + %2], xmm0 %endm +SECTION .text + ;void vpx_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm index c1a6f23abe..952d9307d1 100644 --- 
a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm +++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm @@ -327,12 +327,12 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ %endm INIT_XMM ssse3 -SUBPIX_HFILTER16 h8 -SUBPIX_HFILTER16 h8_avg -SUBPIX_HFILTER8 h8 -SUBPIX_HFILTER8 h8_avg -SUBPIX_HFILTER4 h8 -SUBPIX_HFILTER4 h8_avg +SUBPIX_HFILTER16 h8 ; vpx_filter_block1d16_h8_ssse3 +SUBPIX_HFILTER16 h8_avg ; vpx_filter_block1d16_h8_avg_ssse3 +SUBPIX_HFILTER8 h8 ; vpx_filter_block1d8_h8_ssse3 +SUBPIX_HFILTER8 h8_avg ; vpx_filter_block1d8_h8_avg_ssse3 +SUBPIX_HFILTER4 h8 ; vpx_filter_block1d4_h8_ssse3 +SUBPIX_HFILTER4 h8_avg ; vpx_filter_block1d4_h8_avg_ssse3 ;------------------------------------------------------------------------------- @@ -795,9 +795,9 @@ cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ %endm INIT_XMM ssse3 -SUBPIX_VFILTER16 v8 -SUBPIX_VFILTER16 v8_avg -SUBPIX_VFILTER v8, 8 -SUBPIX_VFILTER v8_avg, 8 -SUBPIX_VFILTER v8, 4 -SUBPIX_VFILTER v8_avg, 4 +SUBPIX_VFILTER16 v8 ; vpx_filter_block1d16_v8_ssse3 +SUBPIX_VFILTER16 v8_avg ; vpx_filter_block1d16_v8_avg_ssse3 +SUBPIX_VFILTER v8, 8 ; vpx_filter_block1d8_v8_ssse3 +SUBPIX_VFILTER v8_avg, 8 ; vpx_filter_block1d8_v8_avg_ssse3 +SUBPIX_VFILTER v8, 4 ; vpx_filter_block1d4_v8_ssse3 +SUBPIX_VFILTER v8_avg, 4 ; vpx_filter_block1d4_v8_avg_ssse3 diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm index a378dd0402..6d79492e4a 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm @@ -131,6 +131,8 @@ dec rcx %endm +SECTION .text + global sym(vpx_filter_block1d4_v2_sse2) PRIVATE sym(vpx_filter_block1d4_v2_sse2): push rbp diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm index 538b2129db..8c9c817be7 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm +++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm @@ -105,6 +105,8 @@ dec rcx %endm +SECTION .text + global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE sym(vpx_filter_block1d4_v2_ssse3): push rbp diff --git a/libs/libvpx/vpx_mem/vpx_mem.c b/libs/libvpx/vpx_mem/vpx_mem.c index c94ed52d16..eeba34c373 100644 --- a/libs/libvpx/vpx_mem/vpx_mem.c +++ b/libs/libvpx/vpx_mem/vpx_mem.c @@ -76,50 +76,9 @@ void *vpx_calloc(size_t num, size_t size) { return x; } -void *vpx_realloc(void *memblk, size_t size) { - void *new_addr = NULL; - - /* - The realloc() function changes the size of the object pointed to by - ptr to the size specified by size, and returns a pointer to the - possibly moved block. The contents are unchanged up to the lesser - of the new and old sizes. If ptr is null, realloc() behaves like - malloc() for the specified size. If size is zero (0) and ptr is - not a null pointer, the object pointed to is freed. 
- */ - if (!memblk) - new_addr = vpx_malloc(size); - else if (!size) - vpx_free(memblk); - else { - void *addr = get_actual_malloc_address(memblk); - const uint64_t aligned_size = - get_aligned_malloc_size(size, DEFAULT_ALIGNMENT); - if (!check_size_argument_overflow(1, aligned_size)) return NULL; - - addr = realloc(addr, (size_t)aligned_size); - if (addr) { - new_addr = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, - DEFAULT_ALIGNMENT); - set_actual_malloc_address(new_addr, addr); - } - } - - return new_addr; -} - void vpx_free(void *memblk) { if (memblk) { void *addr = get_actual_malloc_address(memblk); free(addr); } } - -#if CONFIG_VP9_HIGHBITDEPTH -void *vpx_memset16(void *dest, int val, size_t length) { - size_t i; - uint16_t *dest16 = (uint16_t *)dest; - for (i = 0; i < length; i++) *dest16++ = val; - return dest; -} -#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libs/libvpx/vpx_mem/vpx_mem.h b/libs/libvpx/vpx_mem/vpx_mem.h index c14f288b89..a4274b8856 100644 --- a/libs/libvpx/vpx_mem/vpx_mem.h +++ b/libs/libvpx/vpx_mem/vpx_mem.h @@ -19,6 +19,8 @@ #include #include +#include "vpx/vpx_integer.h" + #if defined(__cplusplus) extern "C" { #endif @@ -26,11 +28,15 @@ extern "C" { void *vpx_memalign(size_t align, size_t size); void *vpx_malloc(size_t size); void *vpx_calloc(size_t num, size_t size); -void *vpx_realloc(void *memblk, size_t size); void vpx_free(void *memblk); #if CONFIG_VP9_HIGHBITDEPTH -void *vpx_memset16(void *dest, int val, size_t length); +static INLINE void *vpx_memset16(void *dest, int val, size_t length) { + size_t i; + uint16_t *dest16 = (uint16_t *)dest; + for (i = 0; i < length; i++) *dest16++ = val; + return dest; +} #endif #include diff --git a/libs/libvpx/vpx_ports/arm_cpudetect.c b/libs/libvpx/vpx_ports/arm_cpudetect.c index 79c60f7a19..4f9d480ade 100644 --- a/libs/libvpx/vpx_ports/arm_cpudetect.c +++ b/libs/libvpx/vpx_ports/arm_cpudetect.c @@ -58,8 +58,12 @@ int arm_cpu_caps(void) { #elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */ /*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/ +#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif +#ifndef WIN32_EXTRA_LEAN #define WIN32_EXTRA_LEAN +#endif #include int arm_cpu_caps(void) { diff --git a/libs/libvpx/vpx_ports/asmdefs_mmi.h b/libs/libvpx/vpx_ports/asmdefs_mmi.h new file mode 100644 index 0000000000..a9a49745af --- /dev/null +++ b/libs/libvpx/vpx_ports/asmdefs_mmi.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_PORTS_ASMDEFS_MMI_H_ +#define VPX_PORTS_ASMDEFS_MMI_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +#if HAVE_MMI + +#if HAVE_MIPS64 +#define mips_reg int64_t +#define MMI_ADDU(reg1, reg2, reg3) \ + "daddu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_ADDIU(reg1, reg2, immediate) \ + "daddiu " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_ADDI(reg1, reg2, immediate) \ + "daddi " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_SUBU(reg1, reg2, reg3) \ + "dsubu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_L(reg, addr, bias) \ + "ld " #reg ", " #bias "(" #addr ") \n\t" + +#define MMI_SRL(reg1, reg2, shift) \ + "dsrl " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_SLL(reg1, reg2, shift) \ + "dsll " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_MTC1(reg, fp) \ + "dmtc1 " #reg ", " #fp " \n\t" + +#define MMI_LI(reg, immediate) \ + "dli " #reg ", " #immediate " \n\t" + +#else +#define mips_reg int32_t +#define MMI_ADDU(reg1, reg2, reg3) \ + "addu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_ADDIU(reg1, reg2, immediate) \ + "addiu " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_ADDI(reg1, reg2, immediate) \ + "addi " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_SUBU(reg1, reg2, reg3) \ + "subu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_L(reg, addr, bias) \ + "lw " #reg ", " #bias "(" #addr ") \n\t" + +#define MMI_SRL(reg1, reg2, shift) \ + "srl " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_SLL(reg1, reg2, shift) \ + "sll " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_MTC1(reg, fp) \ + "mtc1 " #reg ", " #fp " \n\t" + +#define MMI_LI(reg, immediate) \ + "li " #reg ", " #immediate " \n\t" + +#endif /* HAVE_MIPS64 */ + +#endif /* HAVE_MMI */ + +#endif /* VPX_PORTS_ASMDEFS_MMI_H_ */ diff --git a/libs/libvpx/vpx_ports/mem.h b/libs/libvpx/vpx_ports/mem.h index 2d49b7a06d..bfef783b13 100644 --- a/libs/libvpx/vpx_ports/mem.h +++ b/libs/libvpx/vpx_ports/mem.h @@ -23,16 +23,6 @@ #define DECLARE_ALIGNED(n, typ, val) typ val #endif -/* Indicates that the usage of the specified variable has been audited to assure - * that it's safe to use uninitialized. Silences 'may be used uninitialized' - * warnings on gcc. - */ -#if defined(__GNUC__) && __GNUC__ -#define UNINITIALIZED_IS_SAFE(x) x = x -#else -#define UNINITIALIZED_IS_SAFE(x) x -#endif - #if HAVE_NEON && defined(_MSC_VER) #define __builtin_prefetch(x) #endif @@ -45,8 +35,10 @@ (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1)) +#define CAST_TO_SHORTPTR(x) ((uint16_t *)((uintptr_t)(x))) #if CONFIG_VP9_HIGHBITDEPTH #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1)) +#define CAST_TO_BYTEPTR(x) ((uint8_t *)((uintptr_t)(x))) #endif // CONFIG_VP9_HIGHBITDEPTH #if !defined(__has_feature) diff --git a/libs/libvpx/vpx_ports/ppc.h b/libs/libvpx/vpx_ports/ppc.h new file mode 100644 index 0000000000..ed29ef25b4 --- /dev/null +++ b/libs/libvpx/vpx_ports/ppc.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_PORTS_PPC_H_ +#define VPX_PORTS_PPC_H_ +#include + +#include "./vpx_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define HAS_VSX 0x01 + +int ppc_simd_caps(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_PORTS_PPC_H_ diff --git a/libs/libvpx/vpx_ports/ppc_cpudetect.c b/libs/libvpx/vpx_ports/ppc_cpudetect.c new file mode 100644 index 0000000000..374a0271c9 --- /dev/null +++ b/libs/libvpx/vpx_ports/ppc_cpudetect.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include + +#include "./vpx_config.h" +#include "vpx_ports/ppc.h" + +#if CONFIG_RUNTIME_CPU_DETECT +static int cpu_env_flags(int *flags) { + char *env; + env = getenv("VPX_SIMD_CAPS"); + if (env && *env) { + *flags = (int)strtol(env, NULL, 0); + return 0; + } + *flags = 0; + return -1; +} + +static int cpu_env_mask(void) { + char *env; + env = getenv("VPX_SIMD_CAPS_MASK"); + return env && *env ? (int)strtol(env, NULL, 0) : ~0; +} + +int ppc_simd_caps(void) { + int flags; + int mask; + int fd; + ssize_t count; + unsigned int i; + uint64_t buf[64]; + + // If VPX_SIMD_CAPS is set then allow only those capabilities. + if (!cpu_env_flags(&flags)) { + return flags; + } + + mask = cpu_env_mask(); + + fd = open("/proc/self/auxv", O_RDONLY); + if (fd < 0) { + return 0; + } + + while ((count = read(fd, buf, sizeof(buf))) > 0) { + for (i = 0; i < (count / sizeof(*buf)); i += 2) { + if (buf[i] == AT_HWCAP) { +#if HAVE_VSX + if (buf[i + 1] & PPC_FEATURE_HAS_VSX) { + flags |= HAS_VSX; + } +#endif // HAVE_VSX + goto out_close; + } else if (buf[i] == AT_NULL) { + goto out_close; + } + } + } +out_close: + close(fd); + return flags & mask; +} +#else +// If there is no RTCD the function pointers are not used and can not be +// changed. 
+int ppc_simd_caps(void) { return 0; } +#endif // CONFIG_RUNTIME_CPU_DETECT diff --git a/libs/libvpx/vpx_ports/vpx_ports.mk b/libs/libvpx/vpx_ports/vpx_ports.mk index 36b14936df..e17145e6cb 100644 --- a/libs/libvpx/vpx_ports/vpx_ports.mk +++ b/libs/libvpx/vpx_ports/vpx_ports.mk @@ -25,3 +25,10 @@ endif PORTS_SRCS-$(ARCH_ARM) += arm_cpudetect.c PORTS_SRCS-$(ARCH_ARM) += arm.h + +PORTS_SRCS-$(ARCH_PPC) += ppc_cpudetect.c +PORTS_SRCS-$(ARCH_PPC) += ppc.h + +ifeq ($(ARCH_MIPS), yes) +PORTS_SRCS-yes += asmdefs_mmi.h +endif diff --git a/libs/libvpx/vpx_ports/vpx_timer.h b/libs/libvpx/vpx_ports/vpx_timer.h index 4aae30e947..2083b4ece4 100644 --- a/libs/libvpx/vpx_ports/vpx_timer.h +++ b/libs/libvpx/vpx_ports/vpx_timer.h @@ -21,6 +21,8 @@ /* * Win32 specific includes */ +#undef NOMINMAX +#define NOMINMAX #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #endif @@ -81,7 +83,7 @@ static INLINE int64_t vpx_usec_timer_elapsed(struct vpx_usec_timer *t) { struct timeval diff; timersub(&t->end, &t->begin, &diff); - return diff.tv_sec * 1000000 + diff.tv_usec; + return (int64_t)diff.tv_sec * 1000000 + diff.tv_usec; #endif } diff --git a/libs/libvpx/vpx_ports/x86.h b/libs/libvpx/vpx_ports/x86.h index 6ba02cf1fc..ced65ac058 100644 --- a/libs/libvpx/vpx_ports/x86.h +++ b/libs/libvpx/vpx_ports/x86.h @@ -140,22 +140,28 @@ static INLINE uint64_t xgetbv(void) { #endif #if defined(_MSC_VER) && _MSC_VER >= 1700 +#undef NOMINMAX +#define NOMINMAX +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif #include <windows.h> #if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP) #define getenv(x) NULL #endif #endif -#define HAS_MMX 0x01 -#define HAS_SSE 0x02 -#define HAS_SSE2 0x04 -#define HAS_SSE3 0x08 -#define HAS_SSSE3 0x10 -#define HAS_SSE4_1 0x20 -#define HAS_AVX 0x40 -#define HAS_AVX2 0x80 +#define HAS_MMX 0x001 +#define HAS_SSE 0x002 +#define HAS_SSE2 0x004 +#define HAS_SSE3 0x008 +#define HAS_SSSE3 0x010 +#define HAS_SSE4_1 0x020 +#define HAS_AVX 0x040 +#define HAS_AVX2 0x080 +#define HAS_AVX512 0x100 #ifndef BIT -#define BIT(n) (1 << n) +#define BIT(n) (1u << n) #endif static INLINE int x86_simd_caps(void) { @@ -204,6 +210,12 @@ static INLINE int x86_simd_caps(void) { cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); if (reg_ebx & BIT(5)) flags |= HAS_AVX2; + + // bits 16 (AVX-512F) & 17 (AVX-512DQ) & 28 (AVX-512CD) & + // 30 (AVX-512BW) & 31 (AVX-512VL) + if ((reg_ebx & (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))) == + (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))) + flags |= HAS_AVX512; } } } diff --git a/libs/libvpx/vpx_scale/generic/yv12config.c b/libs/libvpx/vpx_scale/generic/yv12config.c index a674eac84b..9c7ca42c78 100644 --- a/libs/libvpx/vpx_scale/generic/yv12config.c +++ b/libs/libvpx/vpx_scale/generic/yv12config.c @@ -9,6 +9,7 @@ */ #include +#include <limits.h> #include "vpx_scale/yv12config.h" #include "vpx_mem/vpx_mem.h" @@ -165,6 +166,12 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, uint8_t *buf = NULL; + // frame_size is stored in buffer_alloc_sz, which is an int. If it won't + // fit, fail early.
+ if (frame_size > INT_MAX) { + return -1; + } + if (cb != NULL) { const int align_addr_extra_size = 31; const uint64_t external_frame_size = frame_size + align_addr_extra_size; @@ -193,8 +200,6 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, vpx_free(ybf->buffer_alloc); ybf->buffer_alloc = NULL; - if (frame_size != (size_t)frame_size) return -1; - ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, (size_t)frame_size); if (!ybf->buffer_alloc) return -1; diff --git a/libs/libvpx/vpx_scale/generic/yv12extend.c b/libs/libvpx/vpx_scale/generic/yv12extend.c index a6aaff95a0..e231806505 100644 --- a/libs/libvpx/vpx_scale/generic/yv12extend.c +++ b/libs/libvpx/vpx_scale/generic/yv12extend.c @@ -111,25 +111,6 @@ void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { assert(ybf->y_height - ybf->y_crop_height >= 0); assert(ybf->y_width - ybf->y_crop_width >= 0); -#if CONFIG_VP9_HIGHBITDEPTH - if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { - extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, - ybf->y_crop_height, ybf->border, ybf->border, - ybf->border + ybf->y_height - ybf->y_crop_height, - ybf->border + ybf->y_width - ybf->y_crop_width); - - extend_plane_high(ybf->u_buffer, ybf->uv_stride, ybf->uv_crop_width, - ybf->uv_crop_height, uv_border, uv_border, - uv_border + ybf->uv_height - ybf->uv_crop_height, - uv_border + ybf->uv_width - ybf->uv_crop_width); - - extend_plane_high(ybf->v_buffer, ybf->uv_stride, ybf->uv_crop_width, - ybf->uv_crop_height, uv_border, uv_border, - uv_border + ybf->uv_height - ybf->uv_crop_height, - uv_border + ybf->uv_width - ybf->uv_crop_width); - return; - } -#endif extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, ybf->y_crop_height, ybf->border, ybf->border, ybf->border + ybf->y_height - ybf->y_crop_height, @@ -208,12 +189,55 @@ static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) { // Copies the source image into the destination image and updates the // destination's UMV borders. // Note: The frames are assumed to be identical in size. + void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) { int row; const uint8_t *src = src_ybc->y_buffer; uint8_t *dst = dst_ybc->y_buffer; +#if 0 + /* These assertions are valid in the codec, but the libvpx-tester uses + * this code slightly differently. + */ + assert(src_ybc->y_width == dst_ybc->y_width); + assert(src_ybc->y_height == dst_ybc->y_height); +#endif + + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } + + src = src_ybc->u_buffer; + dst = dst_ybc->u_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + src = src_ybc->v_buffer; + dst = dst_ybc->v_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + vp8_yv12_extend_frame_borders_c(dst_ybc); +} + +#if CONFIG_VP9 +void vpx_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { + int row; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; + #if 0 /* These assertions are valid in the codec, but the libvpx-tester uses * this code slightly differently. 
@@ -249,7 +273,7 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, dst += dst_ybc->uv_stride; } - vp8_yv12_extend_frame_borders_c(dst_ybc); + vpx_extend_frame_borders_c(dst_ybc); return; } else { assert(!(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH)); @@ -280,8 +304,9 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, dst += dst_ybc->uv_stride; } - vp8_yv12_extend_frame_borders_c(dst_ybc); + vpx_extend_frame_borders_c(dst_ybc); } +#endif // CONFIG_VP9 void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) { diff --git a/libs/libvpx/vpx_scale/vpx_scale_rtcd.pl b/libs/libvpx/vpx_scale/vpx_scale_rtcd.pl index 44b115c7eb..1281071a7d 100644 --- a/libs/libvpx/vpx_scale/vpx_scale_rtcd.pl +++ b/libs/libvpx/vpx_scale/vpx_scale_rtcd.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + sub vpx_scale_forward_decls() { print < or +// since neither is guaranteed to exist on both C and C++ platforms, and we need +// to back the atomic type with the same type (g++ needs to be able to use +// gcc-built code). g++ 6 doesn't support _Atomic as a keyword and can't use the +// stdatomic.h header. Even if both <stdatomic.h> and <atomic> existed it's not +// guaranteed that atomic_int is the same type as std::atomic_int. +// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=60932#c13. +#if !defined(__has_builtin) +#define __has_builtin(x) 0 // Compatibility with non-clang compilers. +#endif // !defined(__has_builtin) + +#if (__has_builtin(__atomic_load_n)) || \ + (defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))) +// For GCC >= 4.7 and Clang versions that support __atomic builtins, use those. +#define VPX_USE_ATOMIC_BUILTINS +#else +// Use platform-specific asm barriers. +#if defined(_MSC_VER) +// TODO(pbos): This assumes that newer versions of MSVC are building with the +// default /volatile:ms (or older, where this is always true. Consider adding +// support for using <atomic> instead of stdatomic.h when building C++11 under +// MSVC. It's unclear what to do for plain C under /volatile:iso (inline asm?), +// there're no explicit Interlocked* functions for only storing or loading +// (presumably because volatile has historically implied that on MSVC). +// +// For earlier versions of MSVC or the default /volatile:ms volatile int are +// acquire/release and require no barrier. +#define vpx_atomic_memory_barrier() \ + do { \ + } while (0) +#else +#if ARCH_X86 || ARCH_X86_64 +// Use a compiler barrier on x86, no runtime penalty. +#define vpx_atomic_memory_barrier() __asm__ __volatile__("" ::: "memory") +#elif ARCH_ARM +#define vpx_atomic_memory_barrier() __asm__ __volatile__("dmb ish" ::: "memory") +#elif ARCH_MIPS +#define vpx_atomic_memory_barrier() __asm__ __volatile__("sync" ::: "memory") +#else +#error Unsupported architecture! +#endif // ARCH_X86 || ARCH_X86_64 +#endif // defined(_MSC_VER) +#endif // atomic builtin availability check + +// These are wrapped in a struct so that they are not easily accessed directly +// on any platform (to discourage programmer errors by setting values directly).
+// This primitive MUST be initialized using vpx_atomic_init or VPX_ATOMIC_INIT +// (NOT memset) and accessed through vpx_atomic_ functions. +typedef struct vpx_atomic_int { volatile int value; } vpx_atomic_int; + +#define VPX_ATOMIC_INIT(num) \ + { num } + +// Initialization of an atomic int, not thread safe. +static INLINE void vpx_atomic_init(vpx_atomic_int *atomic, int value) { + atomic->value = value; +} + +static INLINE void vpx_atomic_store_release(vpx_atomic_int *atomic, int value) { +#if defined(VPX_USE_ATOMIC_BUILTINS) + __atomic_store_n(&atomic->value, value, __ATOMIC_RELEASE); +#else + vpx_atomic_memory_barrier(); + atomic->value = value; +#endif // defined(VPX_USE_ATOMIC_BUILTINS) +} + +static INLINE int vpx_atomic_load_acquire(const vpx_atomic_int *atomic) { +#if defined(VPX_USE_ATOMIC_BUILTINS) + return __atomic_load_n(&atomic->value, __ATOMIC_ACQUIRE); +#else + int v = atomic->value; + vpx_atomic_memory_barrier(); + return v; +#endif // defined(VPX_USE_ATOMIC_BUILTINS) +} + +#undef VPX_USE_ATOMIC_BUILTINS +#undef vpx_atomic_memory_barrier + +#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */ + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // VPX_UTIL_VPX_ATOMICS_H_ diff --git a/libs/libvpx/vpx_util/vpx_util.mk b/libs/libvpx/vpx_util/vpx_util.mk index c0ef8d3362..86d3ece3c8 100644 --- a/libs/libvpx/vpx_util/vpx_util.mk +++ b/libs/libvpx/vpx_util/vpx_util.mk @@ -8,7 +8,10 @@ ## be found in the AUTHORS file in the root of the source tree. ## +UTIL_SRCS-yes += vpx_atomics.h UTIL_SRCS-yes += vpx_util.mk UTIL_SRCS-yes += vpx_thread.c UTIL_SRCS-yes += vpx_thread.h UTIL_SRCS-yes += endian_inl.h +UTIL_SRCS-yes += vpx_write_yuv_frame.h +UTIL_SRCS-yes += vpx_write_yuv_frame.c diff --git a/libs/libvpx/vpx_util/vpx_write_yuv_frame.c b/libs/libvpx/vpx_util/vpx_write_yuv_frame.c new file mode 100644 index 0000000000..ab68558115 --- /dev/null +++ b/libs/libvpx/vpx_util/vpx_write_yuv_frame.c @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/skin_detection.h" +#include "vpx_util/vpx_write_yuv_frame.h" + +void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) { +#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) || \ + defined(OUTPUT_YUV_SKINMAP) + + unsigned char *src = s->y_buffer; + int h = s->y_crop_height; + + do { + fwrite(src, s->y_width, 1, yuv_file); + src += s->y_stride; + } while (--h); + + src = s->u_buffer; + h = s->uv_crop_height; + + do { + fwrite(src, s->uv_width, 1, yuv_file); + src += s->uv_stride; + } while (--h); + + src = s->v_buffer; + h = s->uv_crop_height; + + do { + fwrite(src, s->uv_width, 1, yuv_file); + src += s->uv_stride; + } while (--h); + +#else + (void)yuv_file; + (void)s; +#endif +} diff --git a/libs/libvpx/vpx_util/vpx_write_yuv_frame.h b/libs/libvpx/vpx_util/vpx_write_yuv_frame.h new file mode 100644 index 0000000000..1cb7029817 --- /dev/null +++ b/libs/libvpx/vpx_util/vpx_write_yuv_frame.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ +#define VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ + +#include +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ diff --git a/libs/libvpx/vpxdec.c b/libs/libvpx/vpxdec.c index ab638ec6b8..ff20e6a3c9 100644 --- a/libs/libvpx/vpxdec.c +++ b/libs/libvpx/vpxdec.c @@ -9,11 +9,11 @@ */ #include +#include +#include #include #include -#include #include -#include #include "./vpx_config.h" @@ -47,6 +47,8 @@ struct VpxDecInputContext { struct WebmInputContext *webm_ctx; }; +static const arg_def_t help = + ARG_DEF(NULL, "help", 0, "Show usage options and exit"); static const arg_def_t looparg = ARG_DEF(NULL, "loops", 1, "Number of times to decode the file"); static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"); @@ -75,7 +77,7 @@ static const arg_def_t outputfile = static const arg_def_t threadsarg = ARG_DEF("t", "threads", 1, "Max threads to use"); static const arg_def_t frameparallelarg = - ARG_DEF(NULL, "frame-parallel", 0, "Frame parallel decode"); + ARG_DEF(NULL, "frame-parallel", 0, "Frame parallel decode (ignored)"); static const arg_def_t verbosearg = ARG_DEF("v", "verbose", 0, "Show version string"); static const arg_def_t error_concealment = @@ -92,31 +94,24 @@ static const arg_def_t md5arg = static const arg_def_t outbitdeptharg = ARG_DEF(NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames"); #endif +static const arg_def_t svcdecodingarg = ARG_DEF( + NULL, "svc-decode-layer", 1, "Decode SVC stream up to given spatial layer"); +static const arg_def_t framestatsarg = + ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)"); -static const arg_def_t *all_args[] = { &codecarg, - &use_yv12, - &use_i420, - &flipuvarg, - &rawvideo, - &noblitarg, - &progressarg, - &limitarg, - &skiparg, - &postprocarg, - &summaryarg, - &outputfile, - &threadsarg, - &frameparallelarg, - &verbosearg, - &scalearg, - &fb_arg, - &md5arg, - &error_concealment, - &continuearg, +static const arg_def_t *all_args[] = { + &help, &codecarg, &use_yv12, + &use_i420, &flipuvarg, &rawvideo, + &noblitarg, &progressarg, &limitarg, + &skiparg, &postprocarg, &summaryarg, + &outputfile, &threadsarg, &frameparallelarg, + &verbosearg, &scalearg, &fb_arg, + &md5arg, &error_concealment, &continuearg, #if CONFIG_VP9_HIGHBITDEPTH - &outbitdeptharg, + &outbitdeptharg, #endif - NULL }; + &svcdecodingarg, &framestatsarg, NULL +}; #if CONFIG_VP8_DECODER static const arg_def_t addnoise_level = @@ -159,41 +154,47 @@ static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst, dst->d_h, mode); } #endif - -void usage_exit(void) { +void show_help(FILE *fout, int shorthelp) { int i; - fprintf(stderr, - "Usage: %s filename\n\n" - "Options:\n", - exec_name); - arg_show_usage(stderr, all_args); + fprintf(fout, "Usage: %s filename\n\n", exec_name); + + if (shorthelp) { + fprintf(fout, "Use --help to see the full list of options.\n"); + return; + } + + fprintf(fout, "Options:\n"); + arg_show_usage(fout, all_args); #if CONFIG_VP8_DECODER - 
fprintf(stderr, "\nVP8 Postprocessing Options:\n"); - arg_show_usage(stderr, vp8_pp_args); + fprintf(fout, "\nVP8 Postprocessing Options:\n"); + arg_show_usage(fout, vp8_pp_args); #endif - fprintf(stderr, + fprintf(fout, "\nOutput File Patterns:\n\n" " The -o argument specifies the name of the file(s) to " "write to. If the\n argument does not include any escape " "characters, the output will be\n written to a single file. " "Otherwise, the filename will be calculated by\n expanding " "the following escape characters:\n"); - fprintf(stderr, + fprintf(fout, "\n\t%%w - Frame width" "\n\t%%h - Frame height" "\n\t%% - Frame number, zero padded to places (1..9)" "\n\n Pattern arguments are only supported in conjunction " "with the --yv12 and\n --i420 options. If the -o option is " "not specified, the output will be\n directed to stdout.\n"); - fprintf(stderr, "\nIncluded decoders:\n\n"); + fprintf(fout, "\nIncluded decoders:\n\n"); for (i = 0; i < get_vpx_decoder_count(); ++i) { const VpxInterface *const decoder = get_vpx_decoder_by_index(i); - fprintf(stderr, " %-6s - %s\n", decoder->name, + fprintf(fout, " %-6s - %s\n", decoder->name, vpx_codec_iface_name(decoder->codec_interface())); } +} +void usage_exit(void) { + show_help(stderr, 1); exit(EXIT_FAILURE); } @@ -495,11 +496,12 @@ static int main_loop(int argc, const char **argv_) { vpx_codec_ctx_t decoder; char *fn = NULL; int i; + int ret = EXIT_FAILURE; uint8_t *buf = NULL; size_t bytes_in_buffer = 0, buffer_size = 0; FILE *infile; int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0; - int do_md5 = 0, progress = 0, frame_parallel = 0; + int do_md5 = 0, progress = 0; int stop_after = 0, postproc = 0, summary = 0, quiet = 1; int arg_skip = 0; int ec_enabled = 0; @@ -518,6 +520,8 @@ static int main_loop(int argc, const char **argv_) { #if CONFIG_VP9_HIGHBITDEPTH unsigned int output_bit_depth = 0; #endif + int svc_decoding = 0; + int svc_spatial_layer = 0; #if CONFIG_VP8_DECODER vp8_postproc_cfg_t vp8_pp_cfg = { 0, 0, 0 }; #endif @@ -536,6 +540,8 @@ static int main_loop(int argc, const char **argv_) { char outfile_name[PATH_MAX] = { 0 }; FILE *outfile = NULL; + FILE *framestats_file = NULL; + MD5Context md5_ctx; unsigned char md5_digest[16]; @@ -556,7 +562,10 @@ static int main_loop(int argc, const char **argv_) { memset(&arg, 0, sizeof(arg)); arg.argv_step = 1; - if (arg_match(&arg, &codecarg, argi)) { + if (arg_match(&arg, &help, argi)) { + show_help(stdout, 0); + exit(EXIT_SUCCESS); + } else if (arg_match(&arg, &codecarg, argi)) { interface = get_vpx_decoder_by_name(arg.val); if (!interface) die("Error: Unrecognized argument (%s) to --codec\n", arg.val); @@ -593,8 +602,9 @@ static int main_loop(int argc, const char **argv_) { else if (arg_match(&arg, &threadsarg, argi)) cfg.threads = arg_parse_uint(&arg); #if CONFIG_VP9_DECODER - else if (arg_match(&arg, &frameparallelarg, argi)) - frame_parallel = 1; + else if (arg_match(&arg, &frameparallelarg, argi)) { + /* ignored for compatibility */ + } #endif else if (arg_match(&arg, &verbosearg, argi)) quiet = 0; @@ -609,6 +619,16 @@ static int main_loop(int argc, const char **argv_) { output_bit_depth = arg_parse_uint(&arg); } #endif + else if (arg_match(&arg, &svcdecodingarg, argi)) { + svc_decoding = 1; + svc_spatial_layer = arg_parse_uint(&arg); + } else if (arg_match(&arg, &framestatsarg, argi)) { + framestats_file = fopen(arg.val, "w"); + if (!framestats_file) { + die("Error: Could not open --framestats file (%s) for writing.\n", + arg.val); + } + } #if CONFIG_VP8_DECODER else if 
(arg_match(&arg, &addnoise_level, argi)) { postproc = 1; @@ -642,6 +662,7 @@ static int main_loop(int argc, const char **argv_) { if (!fn) { free(argv); + fprintf(stderr, "No input file specified!\n"); usage_exit(); } /* Open file */ @@ -717,15 +738,21 @@ static int main_loop(int argc, const char **argv_) { if (!interface) interface = get_vpx_decoder_by_index(0); dec_flags = (postproc ? VPX_CODEC_USE_POSTPROC : 0) | - (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0) | - (frame_parallel ? VPX_CODEC_USE_FRAME_THREADING : 0); + (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0); if (vpx_codec_dec_init(&decoder, interface->codec_interface(), &cfg, dec_flags)) { fprintf(stderr, "Failed to initialize decoder: %s\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; + goto fail2; + } + if (svc_decoding) { + if (vpx_codec_control(&decoder, VP9_DECODE_SVC_SPATIAL_LAYER, + svc_spatial_layer)) { + fprintf(stderr, "Failed to set spatial layer for svc decode: %s\n", + vpx_codec_error(&decoder)); + goto fail; + } } - if (!quiet) fprintf(stderr, "%s\n", decoder.name); #if CONFIG_VP8_DECODER @@ -733,7 +760,7 @@ static int main_loop(int argc, const char **argv_) { vpx_codec_control(&decoder, VP8_SET_POSTPROC, &vp8_pp_cfg)) { fprintf(stderr, "Failed to configure postproc: %s\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; + goto fail; } #endif @@ -752,13 +779,15 @@ static int main_loop(int argc, const char **argv_) { &ext_fb_list)) { fprintf(stderr, "Failed to configure external frame buffers: %s\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; + goto fail; } } frame_avail = 1; got_data = 0; + if (framestats_file) fprintf(framestats_file, "bytes,qp\n"); + /* Decode file */ while (frame_avail || got_data) { vpx_codec_iter_t iter = NULL; @@ -779,11 +808,21 @@ static int main_loop(int argc, const char **argv_) { const char *detail = vpx_codec_error_detail(&decoder); warn("Failed to decode frame %d: %s", frame_in, vpx_codec_error(&decoder)); - if (detail) warn("Additional information: %s", detail); + corrupted = 1; if (!keep_going) goto fail; } + if (framestats_file) { + int qp; + if (vpx_codec_control(&decoder, VPXD_GET_LAST_QUANTIZER, &qp)) { + warn("Failed VPXD_GET_LAST_QUANTIZER: %s", + vpx_codec_error(&decoder)); + if (!keep_going) goto fail; + } + fprintf(framestats_file, "%d,%d\n", (int)bytes_in_buffer, qp); + } + vpx_usec_timer_mark(&timer); dx_time += vpx_usec_timer_elapsed(&timer); } else { @@ -799,6 +838,8 @@ static int main_loop(int argc, const char **argv_) { // Flush the decoder in frame parallel decode. if (vpx_codec_decode(&decoder, NULL, 0, NULL, 0)) { warn("Failed to flush decoder: %s", vpx_codec_error(&decoder)); + corrupted = 1; + if (!keep_going) goto fail; } } @@ -811,7 +852,7 @@ static int main_loop(int argc, const char **argv_) { vpx_usec_timer_mark(&timer); dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer); - if (!frame_parallel && + if (!corrupted && vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) { warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder)); if (!keep_going) goto fail; @@ -861,7 +902,7 @@ static int main_loop(int argc, const char **argv_) { "Scaling is disabled in this configuration. 
" "To enable scaling, configure with --enable-libyuv\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; + goto fail; #endif } } @@ -948,7 +989,7 @@ static int main_loop(int argc, const char **argv_) { if (do_md5) { update_image_md5(img, planes, &md5_ctx); } else { - write_image_file(img, planes, outfile); + if (!corrupted) write_image_file(img, planes, outfile); } } else { generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w, @@ -972,17 +1013,21 @@ static int main_loop(int argc, const char **argv_) { fprintf(stderr, "\n"); } - if (frames_corrupted) + if (frames_corrupted) { fprintf(stderr, "WARNING: %d frames corrupted.\n", frames_corrupted); + } else { + ret = EXIT_SUCCESS; + } fail: if (vpx_codec_destroy(&decoder)) { fprintf(stderr, "Failed to destroy decoder: %s\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; } +fail2: + if (!noblit && single_file) { if (do_md5) { MD5Final(md5_digest, &md5_ctx); @@ -1010,9 +1055,11 @@ fail: free(ext_fb_list.ext_fb); fclose(infile); + if (framestats_file) fclose(framestats_file); + free(argv); - return frames_corrupted ? EXIT_FAILURE : EXIT_SUCCESS; + return ret; } int main(int argc, const char **argv_) { diff --git a/libs/libvpx/vpxenc.c b/libs/libvpx/vpxenc.c index 6e0af57a42..4db7eccc35 100644 --- a/libs/libvpx/vpxenc.c +++ b/libs/libvpx/vpxenc.c @@ -123,6 +123,8 @@ static int fourcc_is_ivf(const char detect[4]) { return 0; } +static const arg_def_t help = + ARG_DEF(NULL, "help", 0, "Show usage options and exit"); static const arg_def_t debugmode = ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)"); static const arg_def_t outputfile = @@ -199,7 +201,8 @@ static const arg_def_t test16bitinternalarg = ARG_DEF( NULL, "test-16bit-internal", 0, "Force use of 16 bit internal buffer"); #endif -static const arg_def_t *main_args[] = { &debugmode, +static const arg_def_t *main_args[] = { &help, + &debugmode, &outputfile, &codecarg, &passes, @@ -321,8 +324,11 @@ static const arg_def_t minsection_pct = ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)"); static const arg_def_t maxsection_pct = ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)"); -static const arg_def_t *rc_twopass_args[] = { &bias_pct, &minsection_pct, - &maxsection_pct, NULL }; +static const arg_def_t corpus_complexity = + ARG_DEF(NULL, "corpus-complexity", 1, "corpus vbr complexity midpoint"); +static const arg_def_t *rc_twopass_args[] = { + &bias_pct, &minsection_pct, &maxsection_pct, &corpus_complexity, NULL +}; static const arg_def_t kf_min_dist = ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)"); @@ -355,6 +361,8 @@ static const arg_def_t cq_level = ARG_DEF(NULL, "cq-level", 1, "Constant/Constrained Quality level"); static const arg_def_t max_intra_rate_pct = ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"); +static const arg_def_t gf_cbr_boost_pct = ARG_DEF( + NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)"); #if CONFIG_VP8_ENCODER static const arg_def_t cpu_used_vp8 = @@ -363,12 +371,21 @@ static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1, "Number of token partitions to use, log2"); static const arg_def_t screen_content_mode = ARG_DEF(NULL, "screen-content-mode", 1, "Screen content mode"); -static const arg_def_t *vp8_args[] = { - &cpu_used_vp8, &auto_altref, &noise_sens, &sharpness, - &static_thresh, &token_parts, &arnr_maxframes, &arnr_strength, - &arnr_type, &tune_ssim, &cq_level, &max_intra_rate_pct, - &screen_content_mode, NULL -}; 
+static const arg_def_t *vp8_args[] = { &cpu_used_vp8,
+                                       &auto_altref,
+                                       &noise_sens,
+                                       &sharpness,
+                                       &static_thresh,
+                                       &token_parts,
+                                       &arnr_maxframes,
+                                       &arnr_strength,
+                                       &arnr_type,
+                                       &tune_ssim,
+                                       &cq_level,
+                                       &max_intra_rate_pct,
+                                       &gf_cbr_boost_pct,
+                                       &screen_content_mode,
+                                       NULL };
 static const int vp8_arg_ctrl_map[] = { VP8E_SET_CPUUSED,
                                         VP8E_SET_ENABLEAUTOALTREF,
                                         VP8E_SET_NOISE_SENSITIVITY,
@@ -381,6 +398,7 @@ static const int vp8_arg_ctrl_map[] = { VP8E_SET_CPUUSED,
                                         VP8E_SET_TUNING,
                                         VP8E_SET_CQ_LEVEL,
                                         VP8E_SET_MAX_INTRA_BITRATE_PCT,
+                                        VP8E_SET_GF_CBR_BOOST_PCT,
                                         VP8E_SET_SCREEN_CONTENT_MODE,
                                         0 };
 #endif
@@ -407,8 +425,6 @@ static const arg_def_t alt_ref_aq = ARG_DEF(NULL, "alt-ref-aq", 1,
 static const arg_def_t frame_periodic_boost =
     ARG_DEF(NULL, "frame-boost", 1,
             "Enable frame periodic boost (0: off (default), 1: on)");
-static const arg_def_t gf_cbr_boost_pct = ARG_DEF(
-    NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)");
 static const arg_def_t max_inter_rate_pct =
     ARG_DEF(NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)");
 static const arg_def_t min_gf_interval = ARG_DEF(
@@ -431,8 +447,8 @@ static const struct arg_enum_list color_space_enum[] = {
 };
 
 static const arg_def_t input_color_space =
-    ARG_DEF_ENUM(NULL, "color-space", 1, "The color space of input content:",
-                 color_space_enum);
+    ARG_DEF_ENUM(NULL, "color-space", 1,
+                 "The color space of input content:", color_space_enum);
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static const struct arg_enum_list bitdepth_enum[] = {
@@ -450,6 +466,7 @@ static const arg_def_t inbitdeptharg =
 static const struct arg_enum_list tune_content_enum[] = {
   { "default", VP9E_CONTENT_DEFAULT },
   { "screen", VP9E_CONTENT_SCREEN },
+  { "film", VP9E_CONTENT_FILM },
   { NULL, 0 }
 };
 
@@ -458,8 +475,18 @@ static const arg_def_t tune_content = ARG_DEF_ENUM(
 
 static const arg_def_t target_level = ARG_DEF(
     NULL, "target-level", 1,
-    "Target level (255: off (default); 0: only keep level stats; 10: level 1.0;"
-    " 11: level 1.1; ... 62: level 6.2)");
+    "Target level\n"
+    " 255: off (default)\n"
+    " 0: only keep level stats\n"
+    " 1: adaptively set alt-ref "
+    "distance and column tile limit based on picture size, and keep"
+    " level stats\n"
+    " 10: level 1.0 11: level 1.1 "
+    "... 62: level 6.2");
+
+static const arg_def_t row_mt =
+    ARG_DEF(NULL, "row-mt", 1,
+            "Enable row based non-deterministic multi-threading in VP9");
 #endif
 
 #if CONFIG_VP9_ENCODER
@@ -488,6 +515,7 @@ static const arg_def_t *vp9_args[] = { &cpu_used_vp9,
                                        &min_gf_interval,
                                        &max_gf_interval,
                                        &target_level,
+                                       &row_mt,
 #if CONFIG_VP9_HIGHBITDEPTH
                                        &bitdeptharg,
                                        &inbitdeptharg,
@@ -518,51 +546,60 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED,
                                         VP9E_SET_MIN_GF_INTERVAL,
                                         VP9E_SET_MAX_GF_INTERVAL,
                                         VP9E_SET_TARGET_LEVEL,
+                                        VP9E_SET_ROW_MT,
                                         0 };
 #endif
 
 static const arg_def_t *no_args[] = { NULL };
 
-void usage_exit(void) {
+void show_help(FILE *fout, int shorthelp) {
   int i;
   const int num_encoder = get_vpx_encoder_count();
 
-  fprintf(stderr, "Usage: %s <options> -o dst_filename src_filename \n",
+  fprintf(fout, "Usage: %s <options> -o dst_filename src_filename \n",
           exec_name);
 
-  fprintf(stderr, "\nOptions:\n");
-  arg_show_usage(stderr, main_args);
-  fprintf(stderr, "\nEncoder Global Options:\n");
-  arg_show_usage(stderr, global_args);
-  fprintf(stderr, "\nRate Control Options:\n");
-  arg_show_usage(stderr, rc_args);
-  fprintf(stderr, "\nTwopass Rate Control Options:\n");
-  arg_show_usage(stderr, rc_twopass_args);
-  fprintf(stderr, "\nKeyframe Placement Options:\n");
-  arg_show_usage(stderr, kf_args);
+  if (shorthelp) {
+    fprintf(fout, "Use --help to see the full list of options.\n");
+    return;
+  }
+
+  fprintf(fout, "\nOptions:\n");
+  arg_show_usage(fout, main_args);
+  fprintf(fout, "\nEncoder Global Options:\n");
+  arg_show_usage(fout, global_args);
+  fprintf(fout, "\nRate Control Options:\n");
+  arg_show_usage(fout, rc_args);
+  fprintf(fout, "\nTwopass Rate Control Options:\n");
+  arg_show_usage(fout, rc_twopass_args);
+  fprintf(fout, "\nKeyframe Placement Options:\n");
+  arg_show_usage(fout, kf_args);
 #if CONFIG_VP8_ENCODER
-  fprintf(stderr, "\nVP8 Specific Options:\n");
-  arg_show_usage(stderr, vp8_args);
+  fprintf(fout, "\nVP8 Specific Options:\n");
+  arg_show_usage(fout, vp8_args);
 #endif
 #if CONFIG_VP9_ENCODER
-  fprintf(stderr, "\nVP9 Specific Options:\n");
-  arg_show_usage(stderr, vp9_args);
+  fprintf(fout, "\nVP9 Specific Options:\n");
+  arg_show_usage(fout, vp9_args);
 #endif
-  fprintf(stderr,
+  fprintf(fout,
           "\nStream timebase (--timebase):\n"
          " The desired precision of timestamps in the output, expressed\n"
          " in fractional seconds. Default is 1/1000.\n");
-  fprintf(stderr, "\nIncluded encoders:\n\n");
+  fprintf(fout, "\nIncluded encoders:\n\n");
 
   for (i = 0; i < num_encoder; ++i) {
     const VpxInterface *const encoder = get_vpx_encoder_by_index(i);
     const char *defstr = (i == (num_encoder - 1)) ? "(default)" : "";
-    fprintf(stderr, " %-6s - %s %s\n", encoder->name,
+    fprintf(fout, " %-6s - %s %s\n", encoder->name,
             vpx_codec_iface_name(encoder->codec_interface()), defstr);
   }
 
-  fprintf(stderr, "\n ");
-  fprintf(stderr, "Use --codec to switch to a non-default encoder.\n\n");
+  fprintf(fout, "\n ");
+  fprintf(fout, "Use --codec to switch to a non-default encoder.\n\n");
+}
+
+void usage_exit(void) {
+  show_help(stderr, 1);
   exit(EXIT_FAILURE);
 }
 
@@ -877,7 +914,10 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) {
   for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
     arg.argv_step = 1;
 
-    if (arg_match(&arg, &codecarg, argi)) {
+    if (arg_match(&arg, &help, argi)) {
+      show_help(stdout, 0);
+      exit(EXIT_SUCCESS);
+    } else if (arg_match(&arg, &codecarg, argi)) {
       global->codec = get_vpx_encoder_by_name(arg.val);
       if (!global->codec)
         die("Error: Unrecognized argument (%s) to --codec\n", arg.val);
@@ -1211,6 +1251,11 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
     } else if (arg_match(&arg, &maxsection_pct, argi)) {
       config->cfg.rc_2pass_vbr_maxsection_pct = arg_parse_uint(&arg);
 
+      if (global->passes < 2)
+        warn("option %s ignored in one-pass mode.\n", arg.name);
+    } else if (arg_match(&arg, &corpus_complexity, argi)) {
+      config->cfg.rc_2pass_vbr_corpus_complexity = arg_parse_uint(&arg);
+
       if (global->passes < 2)
         warn("option %s ignored in one-pass mode.\n", arg.name);
     } else if (arg_match(&arg, &kf_min_dist, argi)) {
@@ -1409,6 +1454,7 @@ static void show_stream_config(struct stream_state *stream,
   SHOW(rc_2pass_vbr_bias_pct);
   SHOW(rc_2pass_vbr_minsection_pct);
   SHOW(rc_2pass_vbr_maxsection_pct);
+  SHOW(rc_2pass_vbr_corpus_complexity);
   SHOW(kf_mode);
   SHOW(kf_min_dist);
   SHOW(kf_max_dist);
@@ -1647,7 +1693,7 @@ static void get_cx_data(struct stream_state *stream,
   *got_data = 0;
   while ((pkt = vpx_codec_get_cx_data(&stream->encoder, &iter))) {
     static size_t fsize = 0;
-    static int64_t ivf_header_pos = 0;
+    static FileOffset ivf_header_pos = 0;
 
     switch (pkt->kind) {
       case VPX_CODEC_CX_FRAME_PKT:
@@ -1673,7 +1719,7 @@ static void get_cx_data(struct stream_state *stream,
           fsize += pkt->data.frame.sz;
 
          if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) {
-            const int64_t currpos = ftello(stream->file);
+            const FileOffset currpos = ftello(stream->file);
            fseeko(stream->file, ivf_header_pos, SEEK_SET);
            ivf_write_frame_size(stream->file, fsize);
            fseeko(stream->file, currpos, SEEK_SET);
@@ -1873,8 +1919,6 @@ int main(int argc, const char **argv_) {
   memset(&input, 0, sizeof(input));
   exec_name = argv_[0];
 
-  if (argc < 3) usage_exit();
-
   /* Setup default input stream settings */
   input.framerate.numerator = 30;
   input.framerate.denominator = 1;
@@ -1888,6 +1932,8 @@ int main(int argc, const char **argv_) {
   argv = argv_dup(argc - 1, argv_ + 1);
   parse_global_config(&global, argv);
 
+  if (argc < 3) usage_exit();
+
   switch (global.color_type) {
     case I420: input.fmt = VPX_IMG_FMT_I420; break;
     case I422: input.fmt = VPX_IMG_FMT_I422; break;
@@ -1921,7 +1967,10 @@ int main(int argc, const char **argv_) {
   /* Handle non-option arguments */
   input.filename = argv[0];
 
-  if (!input.filename) usage_exit();
+  if (!input.filename) {
+    fprintf(stderr, "No input file specified!\n");
+    usage_exit();
+  }
 
   /* Decide if other chroma subsamplings than 4:2:0 are supported */
   if (global.codec->fourcc == VP9_FOURCC) input.only_i420 = 0;
diff --git a/libs/libvpx/webmdec.cc b/libs/libvpx/webmdec.cc
index ed4bd700dd..d609075a93 100644
--- a/libs/libvpx/webmdec.cc
+++ b/libs/libvpx/webmdec.cc
@@ -165,10 +165,11 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
     }
     if (get_new_block) {
       block = block_entry->GetBlock();
+      if (block == NULL) return -1;
       webm_ctx->block_frame_index = 0;
     }
-  } while (block->GetTrackNumber() != webm_ctx->video_track_index ||
-           block_entry_eos);
+  } while (block_entry_eos ||
+           block->GetTrackNumber() != webm_ctx->video_track_index);
 
   webm_ctx->cluster = cluster;
   webm_ctx->block_entry = block_entry;
diff --git a/libs/libvpx/y4menc.c b/libs/libvpx/y4menc.c
index e26fcaf6ea..05018dbc43 100644
--- a/libs/libvpx/y4menc.c
+++ b/libs/libvpx/y4menc.c
@@ -19,9 +19,9 @@ int y4m_write_file_header(char *buf, size_t len, int width, int height,
     case 8:
       color = fmt == VPX_IMG_FMT_444A
                   ? "C444alpha\n"
-                  : fmt == VPX_IMG_FMT_I444 ? "C444\n" : fmt == VPX_IMG_FMT_I422
-                                                             ? "C422\n"
-                                                             : "C420jpeg\n";
+                  : fmt == VPX_IMG_FMT_I444
+                        ? "C444\n"
+                        : fmt == VPX_IMG_FMT_I422 ? "C422\n" : "C420jpeg\n";
       break;
     case 9:
       color = fmt == VPX_IMG_FMT_I44416
diff --git a/libs/libvpx/y4minput.c b/libs/libvpx/y4minput.c
index acf7d69fe9..1de636cc0b 100644
--- a/libs/libvpx/y4minput.c
+++ b/libs/libvpx/y4minput.c
@@ -195,26 +195,29 @@ static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst,
       window.*/
     for (x = 0; x < OC_MINI(_c_w, 2); x++) {
       _dst[x] = (unsigned char)OC_CLAMPI(
-          0, (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] +
-                 35 * _src[OC_MINI(x + 1, _c_w - 1)] -
-                 9 * _src[OC_MINI(x + 2, _c_w - 1)] +
-                 _src[OC_MINI(x + 3, _c_w - 1)] + 64) >>
-                7,
+          0,
+          (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] +
+           35 * _src[OC_MINI(x + 1, _c_w - 1)] -
+           9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[OC_MINI(x + 3, _c_w - 1)] +
+           64) >>
+              7,
          255);
     }
     for (; x < _c_w - 3; x++) {
       _dst[x] = (unsigned char)OC_CLAMPI(
-          0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
-                 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >>
-                7,
+          0,
+          (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
+           35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >>
+              7,
          255);
     }
     for (; x < _c_w; x++) {
       _dst[x] = (unsigned char)OC_CLAMPI(
-          0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
-                 35 * _src[OC_MINI(x + 1, _c_w - 1)] -
-                 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >>
-                7,
+          0,
+          (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
+           35 * _src[OC_MINI(x + 1, _c_w - 1)] -
+           9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >>
+              7,
          255);
     }
     _dst += _c_w;
@@ -314,28 +317,31 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst,
         for (x = 0; x < c_w; x++) {
           for (y = 0; y < OC_MINI(c_h, 3); y++) {
             _dst[y * c_w] = (unsigned char)OC_CLAMPI(
-                0, (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] +
-                       35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] -
-                       17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
-                       4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >>
-                      7,
+                0,
+                (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] +
+                 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] -
+                 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
+                 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >>
+                    7,
                 255);
           }
           for (; y < c_h - 2; y++) {
             _dst[y * c_w] = (unsigned char)OC_CLAMPI(
-                0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
-                       35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
-                       17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >>
-                      7,
+                0,
+                (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
+                 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
+                 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >>
+                    7,
                 255);
           }
           for (; y < c_h; y++) {
             _dst[y * c_w] = (unsigned char)OC_CLAMPI(
-                0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
-                       35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
-                       17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
-                       4 * tmp[(c_h - 1) * c_w] + 64) >>
-                      7,
+                0,
+                (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
+                 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
+                 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
+                 4 * tmp[(c_h - 1) * c_w] + 64) >>
+                    7,
                 255);
           }
           _dst++;
@@ -361,10 +367,11 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst,
           }
           for (; y < c_h - 3; y++) {
             _dst[y * c_w] = (unsigned char)OC_CLAMPI(
-                0, (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] +
-                       114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] -
-                       9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >>
-                      7,
+                0,
+                (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] +
+                 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] -
+                 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >>
+                    7,
                 255);
           }
           for (; y < c_h; y++) {
@@ -404,18 +411,20 @@ static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst,
   for (x = 0; x < _c_w; x++) {
     for (y = 0; y < OC_MINI(_c_h, 2); y += 2) {
       _dst[(y >> 1) * _c_w] =
-          OC_CLAMPI(0, (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] -
-                           17 * _src[OC_MINI(2, _c_h - 1) * _c_w] +
-                           3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >>
-                          7,
+          OC_CLAMPI(0,
+                    (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] -
+                     17 * _src[OC_MINI(2, _c_h - 1) * _c_w] +
+                     3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >>
+                        7,
                     255);
     }
     for (; y < _c_h - 3; y += 2) {
       _dst[(y >> 1) * _c_w] =
-          OC_CLAMPI(0, (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) -
-                           17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) +
-                           78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >>
-                          7,
+          OC_CLAMPI(0,
+                    (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) -
+                     17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) +
+                     78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >>
+                        7,
                     255);
     }
     for (; y < _c_h; y += 2) {
@@ -642,33 +651,38 @@ static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst,
         4-tap Mitchell window.*/
      for (x = 0; x < OC_MINI(c_w, 1); x++) {
        tmp[x << 1] = (unsigned char)OC_CLAMPI(
-            0, (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] -
-                   _aux[OC_MINI(2, c_w - 1)] + 64) >>
-                  7,
+            0,
+            (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] -
+             _aux[OC_MINI(2, c_w - 1)] + 64) >>
+                7,
            255);
        tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(
-            0, (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] -
-                   5 * _aux[OC_MINI(2, c_w - 1)] + 64) >>
-                  7,
+            0,
+            (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] -
+             5 * _aux[OC_MINI(2, c_w - 1)] + 64) >>
+                7,
            255);
      }
      for (; x < c_w - 2; x++) {
        tmp[x << 1] =
-            (unsigned char)OC_CLAMPI(0, (_aux[x - 1] + 110 * _aux[x] +
-                                            18 * _aux[x + 1] - _aux[x + 2] + 64) >>
-                                           7,
+            (unsigned char)OC_CLAMPI(0,
+                                     (_aux[x - 1] + 110 * _aux[x] +
+                                      18 * _aux[x + 1] - _aux[x + 2] + 64) >>
+                                         7,
                                      255);
        tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(
-            0, (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] -
-                   5 * _aux[x + 2] + 64) >>
-                  7,
+            0,
+            (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] -
+             5 * _aux[x + 2] + 64) >>
+                7,
            255);
      }
      for (; x < c_w; x++) {
        tmp[x << 1] = (unsigned char)OC_CLAMPI(
-            0, (_aux[x - 1] + 110 * _aux[x] +
-                   18 * _aux[OC_MINI(x + 1, c_w - 1)] - _aux[c_w - 1] + 64) >>
-                  7,
+            0,
+            (_aux[x - 1] + 110 * _aux[x] + 18 * _aux[OC_MINI(x + 1, c_w - 1)] -
+             _aux[c_w - 1] + 64) >>
+                7,
            255);
        if ((x << 1 | 1) < dst_c_w) {
          tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(
@@ -718,27 +732,29 @@ static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst,
     /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/
     for (y = 0; y < c_h; y++) {
       for (x = 0; x < OC_MINI(c_w, 2); x += 2) {
-        tmp[x >> 1] =
-            OC_CLAMPI(0, (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] -
-                             17 * _aux[OC_MINI(2, c_w - 1)] +
-                             3 * _aux[OC_MINI(3, c_w - 1)] + 64) >>
-                            7,
-                      255);
+        tmp[x >> 1] = OC_CLAMPI(0,
+                                (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] -
+                                 17 * _aux[OC_MINI(2, c_w - 1)] +
+                                 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >>
+                                    7,
+                                255);
       }
       for (; x < c_w - 3; x += 2) {
-        tmp[x >> 1] = OC_CLAMPI(0, (3 * (_aux[x - 2] + _aux[x + 3]) -
-                                       17 * (_aux[x - 1] + _aux[x + 2]) +
-                                       78 * (_aux[x] + _aux[x + 1]) + 64) >>
-                                      7,
+        tmp[x >> 1] = OC_CLAMPI(0,
+                                (3 * (_aux[x - 2] + _aux[x + 3]) -
+                                 17 * (_aux[x - 1] + _aux[x + 2]) +
+                                 78 * (_aux[x] + _aux[x + 1]) + 64) >>
+                                    7,
                                 255);
       }
       for (; x < c_w; x += 2) {
-        tmp[x >> 1] = OC_CLAMPI(
-            0, (3 * (_aux[x - 2] + _aux[c_w - 1]) -
-                   17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) +
-                   78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >>
-                  7,
-            255);
+        tmp[x >> 1] =
+            OC_CLAMPI(0,
+                      (3 * (_aux[x - 2] + _aux[c_w - 1]) -
+                       17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) +
+                       78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >>
+                          7,
+                      255);
       }
       tmp += dst_c_w;
      _aux += c_w;